[llvm] [AMDGPU] Ensure divergence for v_alignbit (PR #129159)

Jeffrey Byrnes via llvm-commits llvm-commits at lists.llvm.org
Fri Sep 26 10:09:24 PDT 2025


https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/129159

>From 47c7afddfd094bb4219d72a435f1205f9c4622bd Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 25 Sep 2025 15:10:49 -0700
Subject: [PATCH 1/2] Precommit autogen checks

Change-Id: I72dcb47e39e5d78a0d770ddcd0fa1023b30a0eab
---
 llvm/test/CodeGen/AMDGPU/alignbit-pat.ll | 126 +++++++++++++++++------
 1 file changed, 97 insertions(+), 29 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/alignbit-pat.ll b/llvm/test/CodeGen/AMDGPU/alignbit-pat.ll
index e65f401bcf68a..86b12fdd352a3 100644
--- a/llvm/test/CodeGen/AMDGPU/alignbit-pat.ll
+++ b/llvm/test/CodeGen/AMDGPU/alignbit-pat.ll
@@ -1,11 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
 
-; GCN-LABEL: {{^}}alignbit_shr_pat:
-; GCN-DAG: s_load_dword s[[SHR:[0-9]+]]
-; GCN-DAG: load_dwordx2 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]]
-; GCN: v_alignbit_b32 v{{[0-9]+}}, v[[HI]], v[[LO]], s[[SHR]]
-
 define amdgpu_kernel void @alignbit_shr_pat(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1, i32 %arg2) {
+; GCN-LABEL: alignbit_shr_pat:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_load_dword s8, s[4:5], 0xd
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT:    s_mov_b32 s4, s2
+; GCN-NEXT:    s_mov_b32 s5, s3
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, s8
+; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    s_endpgm
 bb:
   %tmp = load i64, ptr addrspace(1) %arg, align 8
   %tmp3 = and i32 %arg2, 31
@@ -16,12 +28,24 @@ bb:
   ret void
 }
 
-; GCN-LABEL: {{^}}alignbit_shr_pat_v:
-; GCN-DAG: load_dword v[[SHR:[0-9]+]],
-; GCN-DAG: load_dwordx2 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]]
-; GCN: v_alignbit_b32 v{{[0-9]+}}, v[[HI]], v[[LO]], v[[SHR]]
-
 define amdgpu_kernel void @alignbit_shr_pat_v(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) {
+; GCN-LABEL: alignbit_shr_pat_v:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GCN-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dword v0, v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_alignbit_b32 v0, v4, v3, v0
+; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    s_endpgm
 bb:
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep1 = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %tid
@@ -36,12 +60,24 @@ bb:
   ret void
 }
 
-; GCN-LABEL: {{^}}alignbit_shr_pat_wrong_and30:
-; Negative test, wrong constant
-; GCN: v_lshr_b64
-; GCN-NOT: v_alignbit_b32
-
 define amdgpu_kernel void @alignbit_shr_pat_wrong_and30(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1, i32 %arg2) {
+; GCN-LABEL: alignbit_shr_pat_wrong_and30:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_load_dword s8, s[4:5], 0xd
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT:    s_mov_b32 s4, s2
+; GCN-NEXT:    s_mov_b32 s5, s3
+; GCN-NEXT:    s_and_b32 s0, s8, 30
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshr_b64 v[0:1], v[0:1], s0
+; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    s_endpgm
 bb:
   %tmp = load i64, ptr addrspace(1) %arg, align 8
   %tmp3 = and i32 %arg2, 30
@@ -52,12 +88,23 @@ bb:
   ret void
 }
 
-; GCN-LABEL: {{^}}alignbit_shr_pat_wrong_and63:
-; Negative test, wrong constant
-; GCN: v_lshr_b64
-; GCN-NOT: v_alignbit_b32
-
 define amdgpu_kernel void @alignbit_shr_pat_wrong_and63(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1, i32 %arg2) {
+; GCN-LABEL: alignbit_shr_pat_wrong_and63:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_load_dword s8, s[4:5], 0xd
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT:    s_mov_b32 s4, s2
+; GCN-NEXT:    s_mov_b32 s5, s3
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshr_b64 v[0:1], v[0:1], s8
+; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    s_endpgm
 bb:
   %tmp = load i64, ptr addrspace(1) %arg, align 8
   %tmp3 = and i32 %arg2, 63
@@ -68,11 +115,22 @@ bb:
   ret void
 }
 
-; GCN-LABEL: {{^}}alignbit_shr_pat_const30:
-; GCN: load_dwordx2 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]]
-; GCN: v_alignbit_b32 v{{[0-9]+}}, v[[HI]], v[[LO]], 30
-
 define amdgpu_kernel void @alignbit_shr_pat_const30(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) {
+; GCN-LABEL: alignbit_shr_pat_const30:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT:    s_mov_b32 s4, s2
+; GCN-NEXT:    s_mov_b32 s5, s3
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, 30
+; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    s_endpgm
 bb:
   %tmp = load i64, ptr addrspace(1) %arg, align 8
   %tmp5 = lshr i64 %tmp, 30
@@ -81,12 +139,22 @@ bb:
   ret void
 }
 
-; GCN-LABEL: {{^}}alignbit_shr_pat_wrong_const33:
-; Negative test, shift amount more than 31
-; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
-; GCN-NOT: v_alignbit_b32
-
 define amdgpu_kernel void @alignbit_shr_pat_wrong_const33(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) {
+; GCN-LABEL: alignbit_shr_pat_wrong_const33:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s4, s2
+; GCN-NEXT:    s_mov_b32 s5, s3
+; GCN-NEXT:    s_mov_b32 s2, s6
+; GCN-NEXT:    s_mov_b32 s3, s7
+; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:4
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
+; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    s_endpgm
 bb:
   %tmp = load i64, ptr addrspace(1) %arg, align 8
   %tmp5 = lshr i64 %tmp, 33

>From cbab59c7f4e53ccca34919f515887535bfef66bb Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 25 Sep 2025 12:37:15 -0700
Subject: [PATCH 2/2] [AMDGPU] Ensure divergence for trunc -> v_alignbit
 pattern

Change-Id: Ie919caa7e7707ab3102f2e352eae83edbb27dbf5
---
 llvm/lib/Target/AMDGPU/SIInstructions.td      |    14 +-
 llvm/test/CodeGen/AMDGPU/alignbit-pat.ll      |     5 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll  | 26449 +++++++++-------
 .../CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll   |  1173 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll   |   150 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll   |   280 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll   |   205 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll   |  2469 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll   |   300 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll   |  3884 +--
 .../CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll   |   416 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll   |   726 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll   |   843 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll    |    38 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll   |  8423 ++---
 .../CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll   |  1669 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll   |  1825 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll    |   579 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll   |  2055 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll   |  2280 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll   |  2601 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll   |  2839 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll   |  3191 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll    |   641 +-
 .../AMDGPU/amdgpu-codegenprepare-idiv.ll      |   750 +-
 .../branch-folding-implicit-def-subreg.ll     |   591 +-
 .../CodeGen/AMDGPU/call-argument-types.ll     |     8 +-
 .../CodeGen/AMDGPU/cgp-bitfield-extract.ll    |     7 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll    |    68 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll     |    28 +-
 llvm/test/CodeGen/AMDGPU/function-returns.ll  |    10 +-
 .../CodeGen/AMDGPU/insert-delay-alu-bug.ll    |    16 +-
 .../AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll    |    58 +-
 llvm/test/CodeGen/AMDGPU/mul_int24.ll         |    17 +-
 llvm/test/CodeGen/AMDGPU/sdiv64.ll            |    70 +-
 .../test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll |     4 +-
 llvm/test/CodeGen/AMDGPU/srem64.ll            |    82 +-
 llvm/test/CodeGen/AMDGPU/udiv64.ll            |    32 +-
 llvm/test/CodeGen/AMDGPU/uniform-alignbit.ll  |    38 +
 llvm/test/CodeGen/AMDGPU/urem64.ll            |    24 +-
 40 files changed, 34123 insertions(+), 30735 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/uniform-alignbit.ll

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index d4c1bc6d84384..e1f1788726ad7 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2660,11 +2660,11 @@ let True16Predicate = NotHasTrue16BitInsts in {
 let SubtargetPredicate = isNotGFX9Plus in {
 def : ROTRPattern <V_ALIGNBIT_B32_e64>;
 
-def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
+def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (and i32:$src1, (i32 31))))),
           (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
                           (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
 
-def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
+def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
           (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
                           (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
 } // isNotGFX9Plus
@@ -2678,8 +2678,8 @@ def : GCNPat <
                                   $src1, /* clamp */ 0, /* op_sel */ 0)
 >;
 
-foreach pat = [(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
-               (i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1))))] in
+foreach pat = [(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (and i32:$src1, (i32 31))))),
+               (i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm:$src1))))] in
 def : GCNPat<pat,
         (V_ALIGNBIT_B32_opsel_e64 0, /* src0_modifiers */
                                   (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
@@ -2708,7 +2708,7 @@ def : GCNPat <
                           /* clamp */ 0, /* op_sel */ 0)
 >;
 
-def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
+def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
           (V_ALIGNBIT_B32_t16_e64 0, /* src0_modifiers */
                           (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
                           0, /* src1_modifiers */
@@ -2734,7 +2734,7 @@ def : GCNPat <
                              $src1, /* clamp */ 0, /* op_sel */ 0)
 >;
 
-def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
+def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (and i32:$src1, (i32 31))))),
      (V_ALIGNBIT_B32_fake16_e64 0, /* src0_modifiers */
                                (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
                                 0, /* src1_modifiers */
@@ -2743,7 +2743,7 @@ def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
                                 $src1, /* clamp */ 0, /* op_sel */ 0)
 >;
 
-def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
+def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
      (V_ALIGNBIT_B32_fake16_e64 0, /* src0_modifiers */
                                (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
                                 0, /* src1_modifiers */
diff --git a/llvm/test/CodeGen/AMDGPU/alignbit-pat.ll b/llvm/test/CodeGen/AMDGPU/alignbit-pat.ll
index 86b12fdd352a3..7fcb29d367006 100644
--- a/llvm/test/CodeGen/AMDGPU/alignbit-pat.ll
+++ b/llvm/test/CodeGen/AMDGPU/alignbit-pat.ll
@@ -14,8 +14,9 @@ define amdgpu_kernel void @alignbit_shr_pat(ptr addrspace(1) nocapture readonly
 ; GCN-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_mov_b32 s4, s2
 ; GCN-NEXT:    s_mov_b32 s5, s3
+; GCN-NEXT:    s_and_b32 s0, s8, 31
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, s8
+; GCN-NEXT:    v_lshr_b64 v[0:1], v[0:1], s0
 ; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
 bb:
@@ -128,7 +129,7 @@ define amdgpu_kernel void @alignbit_shr_pat_const30(ptr addrspace(1) nocapture r
 ; GCN-NEXT:    s_mov_b32 s4, s2
 ; GCN-NEXT:    s_mov_b32 s5, s3
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_alignbit_b32 v0, v1, v0, 30
+; GCN-NEXT:    v_lshr_b64 v[0:1], v[0:1], 30
 ; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 8424917b2d604..815b9f294be8f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -7526,831 +7526,1167 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; SI-LABEL: bitcast_v32i32_to_v128i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v41, s30, 0
-; SI-NEXT:    v_writelane_b32 v41, s31, 1
-; SI-NEXT:    v_writelane_b32 v41, s34, 2
-; SI-NEXT:    v_writelane_b32 v41, s35, 3
-; SI-NEXT:    v_writelane_b32 v41, s36, 4
-; SI-NEXT:    v_writelane_b32 v41, s37, 5
-; SI-NEXT:    v_writelane_b32 v41, s38, 6
-; SI-NEXT:    v_writelane_b32 v41, s39, 7
-; SI-NEXT:    v_writelane_b32 v41, s48, 8
-; SI-NEXT:    v_writelane_b32 v41, s49, 9
-; SI-NEXT:    v_writelane_b32 v41, s50, 10
-; SI-NEXT:    v_writelane_b32 v41, s51, 11
-; SI-NEXT:    v_writelane_b32 v41, s52, 12
-; SI-NEXT:    v_writelane_b32 v41, s53, 13
-; SI-NEXT:    v_writelane_b32 v41, s54, 14
-; SI-NEXT:    v_writelane_b32 v41, s55, 15
-; SI-NEXT:    v_writelane_b32 v41, s64, 16
-; SI-NEXT:    v_writelane_b32 v41, s65, 17
-; SI-NEXT:    v_writelane_b32 v41, s66, 18
-; SI-NEXT:    v_writelane_b32 v41, s67, 19
-; SI-NEXT:    v_writelane_b32 v41, s68, 20
-; SI-NEXT:    v_writelane_b32 v41, s69, 21
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    v_writelane_b32 v20, s30, 0
+; SI-NEXT:    v_writelane_b32 v20, s31, 1
+; SI-NEXT:    v_writelane_b32 v20, s34, 2
+; SI-NEXT:    v_writelane_b32 v20, s35, 3
+; SI-NEXT:    v_writelane_b32 v20, s36, 4
+; SI-NEXT:    v_writelane_b32 v20, s37, 5
+; SI-NEXT:    v_writelane_b32 v20, s38, 6
+; SI-NEXT:    v_writelane_b32 v20, s39, 7
+; SI-NEXT:    v_writelane_b32 v20, s48, 8
+; SI-NEXT:    v_writelane_b32 v20, s49, 9
+; SI-NEXT:    v_writelane_b32 v20, s50, 10
+; SI-NEXT:    v_writelane_b32 v20, s51, 11
+; SI-NEXT:    v_writelane_b32 v20, s52, 12
+; SI-NEXT:    v_writelane_b32 v20, s53, 13
+; SI-NEXT:    v_writelane_b32 v20, s54, 14
+; SI-NEXT:    v_writelane_b32 v20, s55, 15
+; SI-NEXT:    v_writelane_b32 v20, s64, 16
+; SI-NEXT:    v_writelane_b32 v20, s65, 17
+; SI-NEXT:    v_writelane_b32 v20, s66, 18
+; SI-NEXT:    v_writelane_b32 v20, s67, 19
+; SI-NEXT:    v_writelane_b32 v20, s68, 20
+; SI-NEXT:    v_writelane_b32 v20, s69, 21
+; SI-NEXT:    v_writelane_b32 v20, s70, 22
+; SI-NEXT:    v_writelane_b32 v20, s71, 23
+; SI-NEXT:    v_writelane_b32 v20, s80, 24
+; SI-NEXT:    v_writelane_b32 v20, s81, 25
+; SI-NEXT:    v_writelane_b32 v20, s82, 26
+; SI-NEXT:    v_writelane_b32 v20, s83, 27
+; SI-NEXT:    v_writelane_b32 v20, s84, 28
+; SI-NEXT:    v_writelane_b32 v20, s85, 29
+; SI-NEXT:    v_writelane_b32 v20, s86, 30
+; SI-NEXT:    v_writelane_b32 v20, s87, 31
+; SI-NEXT:    v_writelane_b32 v20, s96, 32
+; SI-NEXT:    v_writelane_b32 v20, s97, 33
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v19
-; SI-NEXT:    v_writelane_b32 v41, s70, 22
-; SI-NEXT:    v_readfirstlane_b32 s47, v1
-; SI-NEXT:    v_readfirstlane_b32 s46, v2
-; SI-NEXT:    v_readfirstlane_b32 s45, v3
-; SI-NEXT:    v_readfirstlane_b32 s44, v4
-; SI-NEXT:    v_readfirstlane_b32 s43, v5
-; SI-NEXT:    v_readfirstlane_b32 s42, v6
-; SI-NEXT:    v_readfirstlane_b32 s41, v7
-; SI-NEXT:    v_readfirstlane_b32 s40, v8
-; SI-NEXT:    v_readfirstlane_b32 s15, v9
-; SI-NEXT:    v_readfirstlane_b32 s14, v10
-; SI-NEXT:    v_readfirstlane_b32 s13, v11
-; SI-NEXT:    v_readfirstlane_b32 s12, v12
-; SI-NEXT:    v_readfirstlane_b32 s11, v13
-; SI-NEXT:    v_readfirstlane_b32 s10, v14
-; SI-NEXT:    v_readfirstlane_b32 s9, v15
-; SI-NEXT:    v_readfirstlane_b32 s8, v16
-; SI-NEXT:    v_readfirstlane_b32 s7, v17
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_readfirstlane_b32 s6, v18
-; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    v_writelane_b32 v41, s71, 23
+; SI-NEXT:    v_writelane_b32 v20, s98, 34
+; SI-NEXT:    v_readfirstlane_b32 s44, v1
+; SI-NEXT:    v_readfirstlane_b32 s45, v2
+; SI-NEXT:    v_readfirstlane_b32 s42, v3
+; SI-NEXT:    v_readfirstlane_b32 s43, v4
+; SI-NEXT:    v_readfirstlane_b32 s40, v5
+; SI-NEXT:    v_readfirstlane_b32 s41, v6
+; SI-NEXT:    v_readfirstlane_b32 s14, v7
+; SI-NEXT:    v_readfirstlane_b32 s15, v8
+; SI-NEXT:    v_readfirstlane_b32 s12, v9
+; SI-NEXT:    v_readfirstlane_b32 s13, v10
+; SI-NEXT:    v_readfirstlane_b32 s10, v11
+; SI-NEXT:    v_readfirstlane_b32 s11, v12
+; SI-NEXT:    v_readfirstlane_b32 s8, v13
+; SI-NEXT:    v_readfirstlane_b32 s9, v14
+; SI-NEXT:    v_readfirstlane_b32 s6, v15
+; SI-NEXT:    v_readfirstlane_b32 s7, v16
+; SI-NEXT:    v_readfirstlane_b32 s4, v17
+; SI-NEXT:    s_and_b64 s[46:47], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s5, v18
+; SI-NEXT:    v_writelane_b32 v20, s99, 35
+; SI-NEXT:    ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
+; SI-NEXT:    ; implicit-def: $vgpr21 : SGPR spill to VGPR lane
 ; SI-NEXT:    s_cbranch_scc0 .LBB13_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v22, s45
-; SI-NEXT:    v_mov_b32_e32 v3, s7
-; SI-NEXT:    v_mov_b32_e32 v6, s9
-; SI-NEXT:    v_mov_b32_e32 v9, s11
-; SI-NEXT:    v_mov_b32_e32 v12, s13
-; SI-NEXT:    v_mov_b32_e32 v15, s15
-; SI-NEXT:    v_mov_b32_e32 v18, s41
-; SI-NEXT:    v_mov_b32_e32 v21, s43
-; SI-NEXT:    v_alignbit_b32 v24, s44, v22, 24
-; SI-NEXT:    v_alignbit_b32 v25, s44, v22, 16
-; SI-NEXT:    v_alignbit_b32 v26, s44, v22, 8
-; SI-NEXT:    v_mov_b32_e32 v22, s47
-; SI-NEXT:    v_mov_b32_e32 v23, s28
-; SI-NEXT:    v_mov_b32_e32 v29, s26
-; SI-NEXT:    v_mov_b32_e32 v35, s24
-; SI-NEXT:    v_mov_b32_e32 v39, s22
-; SI-NEXT:    v_mov_b32_e32 v50, s20
-; SI-NEXT:    v_mov_b32_e32 v53, s18
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v40, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v3, 24
-; SI-NEXT:    v_alignbit_b32 v2, s6, v3, 16
-; SI-NEXT:    v_alignbit_b32 v3, s6, v3, 8
-; SI-NEXT:    v_alignbit_b32 v4, s8, v6, 24
-; SI-NEXT:    v_alignbit_b32 v5, s8, v6, 16
-; SI-NEXT:    v_alignbit_b32 v6, s8, v6, 8
-; SI-NEXT:    v_alignbit_b32 v7, s10, v9, 24
-; SI-NEXT:    v_alignbit_b32 v8, s10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v9, s10, v9, 8
-; SI-NEXT:    v_alignbit_b32 v10, s12, v12, 24
-; SI-NEXT:    v_alignbit_b32 v11, s12, v12, 16
-; SI-NEXT:    v_alignbit_b32 v12, s12, v12, 8
-; SI-NEXT:    v_alignbit_b32 v13, s14, v15, 24
-; SI-NEXT:    v_alignbit_b32 v14, s14, v15, 16
-; SI-NEXT:    v_alignbit_b32 v15, s14, v15, 8
-; SI-NEXT:    v_alignbit_b32 v16, s40, v18, 24
-; SI-NEXT:    v_alignbit_b32 v17, s40, v18, 16
-; SI-NEXT:    v_alignbit_b32 v18, s40, v18, 8
-; SI-NEXT:    v_alignbit_b32 v19, s42, v21, 24
-; SI-NEXT:    v_alignbit_b32 v20, s42, v21, 16
-; SI-NEXT:    v_alignbit_b32 v21, s42, v21, 8
-; SI-NEXT:    v_alignbit_b32 v30, s46, v22, 24
-; SI-NEXT:    v_alignbit_b32 v31, s46, v22, 16
-; SI-NEXT:    v_alignbit_b32 v32, s46, v22, 8
-; SI-NEXT:    v_alignbit_b32 v36, s29, v23, 24
-; SI-NEXT:    v_alignbit_b32 v22, s29, v23, 16
-; SI-NEXT:    v_alignbit_b32 v23, s29, v23, 8
-; SI-NEXT:    v_alignbit_b32 v27, s27, v29, 24
-; SI-NEXT:    v_alignbit_b32 v28, s27, v29, 16
-; SI-NEXT:    v_alignbit_b32 v29, s27, v29, 8
-; SI-NEXT:    v_alignbit_b32 v33, s25, v35, 24
-; SI-NEXT:    v_alignbit_b32 v34, s25, v35, 16
-; SI-NEXT:    v_alignbit_b32 v35, s25, v35, 8
-; SI-NEXT:    v_alignbit_b32 v37, s23, v39, 24
-; SI-NEXT:    v_alignbit_b32 v38, s23, v39, 16
-; SI-NEXT:    v_alignbit_b32 v39, s23, v39, 8
-; SI-NEXT:    v_alignbit_b32 v48, s21, v50, 24
-; SI-NEXT:    v_alignbit_b32 v49, s21, v50, 16
-; SI-NEXT:    v_alignbit_b32 v50, s21, v50, 8
-; SI-NEXT:    v_alignbit_b32 v51, s19, v53, 24
-; SI-NEXT:    v_alignbit_b32 v52, s19, v53, 16
-; SI-NEXT:    v_alignbit_b32 v53, s19, v53, 8
-; SI-NEXT:    v_alignbit_b32 v54, s17, v40, 24
-; SI-NEXT:    v_alignbit_b32 v55, s17, v40, 16
-; SI-NEXT:    v_alignbit_b32 v40, s17, v40, 8
-; SI-NEXT:    s_lshr_b32 s56, s6, 24
-; SI-NEXT:    s_lshr_b32 s57, s6, 16
-; SI-NEXT:    s_lshr_b32 s58, s6, 8
-; SI-NEXT:    s_lshr_b32 s59, s8, 24
-; SI-NEXT:    s_lshr_b32 s60, s8, 16
-; SI-NEXT:    s_lshr_b32 s61, s8, 8
-; SI-NEXT:    s_lshr_b32 s62, s10, 24
-; SI-NEXT:    s_lshr_b32 s63, s10, 16
-; SI-NEXT:    s_lshr_b32 s72, s10, 8
-; SI-NEXT:    s_lshr_b32 s73, s12, 24
-; SI-NEXT:    s_lshr_b32 s74, s12, 16
-; SI-NEXT:    s_lshr_b32 s75, s12, 8
-; SI-NEXT:    s_lshr_b32 s76, s14, 24
-; SI-NEXT:    s_lshr_b32 s77, s14, 16
-; SI-NEXT:    s_lshr_b32 s78, s14, 8
-; SI-NEXT:    s_lshr_b32 s79, s40, 24
-; SI-NEXT:    s_lshr_b32 s88, s40, 16
-; SI-NEXT:    s_lshr_b32 s89, s40, 8
-; SI-NEXT:    s_lshr_b32 s90, s42, 24
-; SI-NEXT:    s_lshr_b32 s91, s42, 16
-; SI-NEXT:    s_lshr_b32 s92, s42, 8
-; SI-NEXT:    s_lshr_b32 s93, s44, 24
-; SI-NEXT:    s_lshr_b32 s94, s44, 16
-; SI-NEXT:    s_lshr_b32 s95, s44, 8
-; SI-NEXT:    s_lshr_b32 s30, s46, 24
-; SI-NEXT:    s_lshr_b32 s31, s46, 16
-; SI-NEXT:    s_lshr_b32 s34, s46, 8
-; SI-NEXT:    s_lshr_b32 s35, s29, 24
-; SI-NEXT:    s_lshr_b32 s36, s29, 16
-; SI-NEXT:    s_lshr_b32 s37, s29, 8
-; SI-NEXT:    s_lshr_b32 s38, s27, 24
-; SI-NEXT:    s_lshr_b32 s39, s27, 16
-; SI-NEXT:    s_lshr_b32 s48, s27, 8
-; SI-NEXT:    s_lshr_b32 s49, s25, 24
-; SI-NEXT:    s_lshr_b32 s50, s25, 16
-; SI-NEXT:    s_lshr_b32 s51, s25, 8
-; SI-NEXT:    s_lshr_b32 s52, s23, 24
-; SI-NEXT:    s_lshr_b32 s53, s23, 16
-; SI-NEXT:    s_lshr_b32 s54, s23, 8
-; SI-NEXT:    s_lshr_b32 s55, s21, 24
-; SI-NEXT:    s_lshr_b32 s64, s21, 16
-; SI-NEXT:    s_lshr_b32 s65, s21, 8
-; SI-NEXT:    s_lshr_b32 s66, s19, 24
-; SI-NEXT:    s_lshr_b32 s67, s19, 16
-; SI-NEXT:    s_lshr_b32 s68, s19, 8
-; SI-NEXT:    s_lshr_b32 s69, s17, 24
-; SI-NEXT:    s_lshr_b32 s70, s17, 16
-; SI-NEXT:    s_lshr_b32 s71, s17, 8
+; SI-NEXT:    s_lshr_b32 s46, s5, 24
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_writelane_b32 v22, s46, 42
+; SI-NEXT:    s_lshr_b32 s46, s5, 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 43
+; SI-NEXT:    s_lshr_b32 s46, s5, 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 44
+; SI-NEXT:    s_lshr_b32 s46, s7, 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 45
+; SI-NEXT:    s_lshr_b32 s46, s7, 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 46
+; SI-NEXT:    s_lshr_b32 s46, s7, 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 47
+; SI-NEXT:    s_lshr_b32 s46, s9, 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 48
+; SI-NEXT:    s_lshr_b32 s46, s9, 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 49
+; SI-NEXT:    s_lshr_b32 s46, s11, 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 50
+; SI-NEXT:    s_lshr_b32 s46, s11, 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 51
+; SI-NEXT:    s_lshr_b32 s46, s11, 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 52
+; SI-NEXT:    s_lshr_b32 s46, s13, 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 53
+; SI-NEXT:    s_lshr_b32 s46, s13, 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 54
+; SI-NEXT:    s_lshr_b32 s46, s13, 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 55
+; SI-NEXT:    s_lshr_b32 s46, s15, 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 56
+; SI-NEXT:    s_lshr_b32 s46, s15, 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 57
+; SI-NEXT:    s_lshr_b32 s46, s15, 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 58
+; SI-NEXT:    s_lshr_b32 s46, s41, 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 59
+; SI-NEXT:    s_lshr_b32 s46, s41, 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 60
+; SI-NEXT:    s_lshr_b32 s46, s41, 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 61
+; SI-NEXT:    s_lshr_b32 s46, s43, 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 62
+; SI-NEXT:    s_lshr_b32 s46, s43, 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 63
+; SI-NEXT:    s_lshr_b32 s46, s43, 8
+; SI-NEXT:    v_writelane_b32 v21, s46, 0
+; SI-NEXT:    s_lshr_b32 s46, s45, 24
+; SI-NEXT:    v_writelane_b32 v21, s46, 1
+; SI-NEXT:    s_lshr_b32 s46, s45, 16
+; SI-NEXT:    v_writelane_b32 v21, s46, 2
+; SI-NEXT:    s_lshr_b32 s46, s45, 8
+; SI-NEXT:    v_writelane_b32 v21, s46, 3
+; SI-NEXT:    s_lshr_b32 s46, s29, 24
+; SI-NEXT:    v_writelane_b32 v21, s46, 4
+; SI-NEXT:    s_lshr_b32 s46, s29, 16
+; SI-NEXT:    v_writelane_b32 v21, s46, 5
+; SI-NEXT:    s_lshr_b32 s46, s29, 8
+; SI-NEXT:    v_writelane_b32 v21, s46, 6
+; SI-NEXT:    s_lshr_b32 s46, s27, 24
+; SI-NEXT:    v_writelane_b32 v21, s46, 7
+; SI-NEXT:    s_lshr_b32 s46, s27, 16
+; SI-NEXT:    v_writelane_b32 v21, s46, 8
+; SI-NEXT:    s_lshr_b32 s46, s27, 8
+; SI-NEXT:    v_writelane_b32 v21, s46, 9
+; SI-NEXT:    s_lshr_b32 s46, s25, 24
+; SI-NEXT:    v_writelane_b32 v21, s46, 10
+; SI-NEXT:    s_lshr_b32 s46, s25, 16
+; SI-NEXT:    v_writelane_b32 v21, s46, 11
+; SI-NEXT:    s_lshr_b32 s46, s25, 8
+; SI-NEXT:    v_writelane_b32 v21, s46, 12
+; SI-NEXT:    s_lshr_b32 s46, s23, 24
+; SI-NEXT:    v_writelane_b32 v21, s46, 13
+; SI-NEXT:    s_lshr_b32 s46, s23, 16
+; SI-NEXT:    v_writelane_b32 v21, s46, 14
+; SI-NEXT:    s_lshr_b32 s46, s23, 8
+; SI-NEXT:    v_writelane_b32 v21, s46, 15
+; SI-NEXT:    s_lshr_b32 s46, s21, 24
+; SI-NEXT:    v_writelane_b32 v21, s46, 16
+; SI-NEXT:    s_lshr_b32 s46, s21, 16
+; SI-NEXT:    v_writelane_b32 v21, s46, 17
+; SI-NEXT:    s_lshr_b64 s[46:47], s[4:5], 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 40
+; SI-NEXT:    v_writelane_b32 v22, s47, 41
+; SI-NEXT:    s_lshr_b64 s[46:47], s[4:5], 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 38
+; SI-NEXT:    v_writelane_b32 v22, s47, 39
+; SI-NEXT:    s_lshr_b64 s[46:47], s[4:5], 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 36
+; SI-NEXT:    v_writelane_b32 v22, s47, 37
+; SI-NEXT:    s_lshr_b64 s[46:47], s[6:7], 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 34
+; SI-NEXT:    v_writelane_b32 v22, s47, 35
+; SI-NEXT:    s_lshr_b64 s[46:47], s[6:7], 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 32
+; SI-NEXT:    v_writelane_b32 v22, s47, 33
+; SI-NEXT:    s_lshr_b64 s[46:47], s[6:7], 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 30
+; SI-NEXT:    v_writelane_b32 v22, s47, 31
+; SI-NEXT:    s_lshr_b64 s[46:47], s[8:9], 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 28
+; SI-NEXT:    v_writelane_b32 v22, s47, 29
+; SI-NEXT:    s_lshr_b64 s[46:47], s[8:9], 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 26
+; SI-NEXT:    v_writelane_b32 v22, s47, 27
+; SI-NEXT:    s_lshr_b64 s[46:47], s[8:9], 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 24
+; SI-NEXT:    v_writelane_b32 v22, s47, 25
+; SI-NEXT:    s_lshr_b64 s[46:47], s[10:11], 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 22
+; SI-NEXT:    v_writelane_b32 v22, s47, 23
+; SI-NEXT:    s_lshr_b64 s[46:47], s[10:11], 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 20
+; SI-NEXT:    v_writelane_b32 v22, s47, 21
+; SI-NEXT:    s_lshr_b64 s[46:47], s[10:11], 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 18
+; SI-NEXT:    v_writelane_b32 v22, s47, 19
+; SI-NEXT:    s_lshr_b64 s[46:47], s[12:13], 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 16
+; SI-NEXT:    v_writelane_b32 v22, s47, 17
+; SI-NEXT:    s_lshr_b64 s[46:47], s[12:13], 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 14
+; SI-NEXT:    v_writelane_b32 v22, s47, 15
+; SI-NEXT:    s_lshr_b64 s[46:47], s[12:13], 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 12
+; SI-NEXT:    v_writelane_b32 v22, s47, 13
+; SI-NEXT:    s_lshr_b64 s[46:47], s[14:15], 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 10
+; SI-NEXT:    v_writelane_b32 v22, s47, 11
+; SI-NEXT:    s_lshr_b64 s[46:47], s[14:15], 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 8
+; SI-NEXT:    v_writelane_b32 v22, s47, 9
+; SI-NEXT:    s_lshr_b64 s[46:47], s[14:15], 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 6
+; SI-NEXT:    v_writelane_b32 v22, s47, 7
+; SI-NEXT:    s_lshr_b64 s[46:47], s[40:41], 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 4
+; SI-NEXT:    v_writelane_b32 v22, s47, 5
+; SI-NEXT:    s_lshr_b64 s[46:47], s[40:41], 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 2
+; SI-NEXT:    v_writelane_b32 v22, s47, 3
+; SI-NEXT:    s_lshr_b64 s[46:47], s[40:41], 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 0
+; SI-NEXT:    s_lshr_b32 s50, s9, 24
+; SI-NEXT:    s_lshr_b32 s51, s21, 8
+; SI-NEXT:    s_lshr_b32 s48, s19, 24
+; SI-NEXT:    s_lshr_b32 s52, s19, 16
+; SI-NEXT:    s_lshr_b32 s53, s19, 8
+; SI-NEXT:    s_lshr_b32 s54, s17, 24
+; SI-NEXT:    s_lshr_b32 s55, s17, 16
+; SI-NEXT:    s_lshr_b32 s49, s17, 8
+; SI-NEXT:    v_writelane_b32 v22, s47, 1
+; SI-NEXT:    s_lshr_b64 s[64:65], s[42:43], 24
+; SI-NEXT:    s_lshr_b64 s[66:67], s[42:43], 16
+; SI-NEXT:    s_lshr_b64 s[68:69], s[42:43], 8
+; SI-NEXT:    s_lshr_b64 s[70:71], s[44:45], 24
+; SI-NEXT:    s_lshr_b64 s[80:81], s[44:45], 16
+; SI-NEXT:    s_lshr_b64 s[82:83], s[44:45], 8
+; SI-NEXT:    s_lshr_b64 s[84:85], s[28:29], 24
+; SI-NEXT:    s_lshr_b64 s[86:87], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[96:97], s[28:29], 8
+; SI-NEXT:    s_lshr_b64 s[98:99], s[26:27], 24
+; SI-NEXT:    s_lshr_b64 s[46:47], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[26:27], 8
+; SI-NEXT:    s_lshr_b64 s[58:59], s[24:25], 24
+; SI-NEXT:    s_lshr_b64 s[60:61], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[62:63], s[24:25], 8
+; SI-NEXT:    s_lshr_b64 s[72:73], s[22:23], 24
+; SI-NEXT:    s_lshr_b64 s[74:75], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[76:77], s[22:23], 8
+; SI-NEXT:    s_lshr_b64 s[78:79], s[20:21], 24
+; SI-NEXT:    s_lshr_b64 s[88:89], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[90:91], s[20:21], 8
+; SI-NEXT:    s_lshr_b64 s[92:93], s[18:19], 24
+; SI-NEXT:    s_lshr_b64 s[94:95], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[30:31], s[18:19], 8
+; SI-NEXT:    s_lshr_b64 s[34:35], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[36:37], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[38:39], s[16:17], 8
 ; SI-NEXT:    s_cbranch_execnz .LBB13_3
 ; SI-NEXT:  .LBB13_2: ; %cmp.true
+; SI-NEXT:    s_add_i32 s5, s5, 3
+; SI-NEXT:    s_add_i32 s4, s4, 3
+; SI-NEXT:    s_lshr_b64 s[46:47], s[4:5], 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 40
+; SI-NEXT:    v_writelane_b32 v22, s47, 41
+; SI-NEXT:    s_lshr_b64 s[46:47], s[4:5], 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 38
+; SI-NEXT:    v_writelane_b32 v22, s47, 39
+; SI-NEXT:    s_lshr_b32 s46, s5, 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 42
+; SI-NEXT:    s_lshr_b32 s46, s5, 16
+; SI-NEXT:    s_add_i32 s7, s7, 3
+; SI-NEXT:    v_writelane_b32 v22, s46, 43
+; SI-NEXT:    s_lshr_b32 s46, s5, 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 44
+; SI-NEXT:    s_lshr_b32 s46, s7, 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 45
+; SI-NEXT:    s_lshr_b32 s46, s7, 16
+; SI-NEXT:    s_add_i32 s9, s9, 3
+; SI-NEXT:    v_writelane_b32 v22, s46, 46
+; SI-NEXT:    s_lshr_b32 s46, s7, 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 47
+; SI-NEXT:    s_lshr_b32 s46, s9, 16
+; SI-NEXT:    s_add_i32 s11, s11, 3
+; SI-NEXT:    v_writelane_b32 v22, s46, 48
+; SI-NEXT:    s_lshr_b32 s46, s9, 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 49
+; SI-NEXT:    s_lshr_b32 s46, s11, 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 50
+; SI-NEXT:    s_lshr_b32 s46, s11, 16
+; SI-NEXT:    s_add_i32 s13, s13, 3
+; SI-NEXT:    v_writelane_b32 v22, s46, 51
+; SI-NEXT:    s_lshr_b32 s46, s11, 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 52
+; SI-NEXT:    s_lshr_b32 s46, s13, 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 53
+; SI-NEXT:    s_lshr_b32 s46, s13, 16
+; SI-NEXT:    s_add_i32 s15, s15, 3
+; SI-NEXT:    v_writelane_b32 v22, s46, 54
+; SI-NEXT:    s_lshr_b32 s46, s13, 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 55
+; SI-NEXT:    s_lshr_b32 s46, s15, 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 56
+; SI-NEXT:    s_lshr_b32 s46, s15, 16
+; SI-NEXT:    s_add_i32 s41, s41, 3
+; SI-NEXT:    v_writelane_b32 v22, s46, 57
+; SI-NEXT:    s_lshr_b32 s46, s15, 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 58
+; SI-NEXT:    s_lshr_b32 s46, s41, 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 59
+; SI-NEXT:    s_lshr_b32 s46, s41, 16
+; SI-NEXT:    s_add_i32 s43, s43, 3
+; SI-NEXT:    v_writelane_b32 v22, s46, 60
+; SI-NEXT:    s_lshr_b32 s46, s41, 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 61
+; SI-NEXT:    s_lshr_b32 s46, s43, 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 62
+; SI-NEXT:    s_lshr_b32 s46, s43, 16
 ; SI-NEXT:    s_add_i32 s45, s45, 3
+; SI-NEXT:    v_writelane_b32 v22, s46, 63
+; SI-NEXT:    s_lshr_b32 s46, s43, 8
+; SI-NEXT:    v_writelane_b32 v21, s46, 0
+; SI-NEXT:    s_lshr_b32 s46, s45, 24
+; SI-NEXT:    v_writelane_b32 v21, s46, 1
+; SI-NEXT:    s_lshr_b32 s46, s45, 16
+; SI-NEXT:    s_add_i32 s29, s29, 3
+; SI-NEXT:    v_writelane_b32 v21, s46, 2
+; SI-NEXT:    s_lshr_b32 s46, s45, 8
+; SI-NEXT:    v_writelane_b32 v21, s46, 3
+; SI-NEXT:    s_lshr_b32 s46, s29, 24
+; SI-NEXT:    v_writelane_b32 v21, s46, 4
+; SI-NEXT:    s_lshr_b32 s46, s29, 16
+; SI-NEXT:    s_add_i32 s27, s27, 3
+; SI-NEXT:    v_writelane_b32 v21, s46, 5
+; SI-NEXT:    s_lshr_b32 s46, s29, 8
+; SI-NEXT:    v_writelane_b32 v21, s46, 6
+; SI-NEXT:    s_lshr_b32 s46, s27, 24
+; SI-NEXT:    v_writelane_b32 v21, s46, 7
+; SI-NEXT:    s_lshr_b32 s46, s27, 16
+; SI-NEXT:    s_add_i32 s25, s25, 3
+; SI-NEXT:    v_writelane_b32 v21, s46, 8
+; SI-NEXT:    s_lshr_b32 s46, s27, 8
+; SI-NEXT:    v_writelane_b32 v21, s46, 9
+; SI-NEXT:    s_lshr_b32 s46, s25, 24
+; SI-NEXT:    v_writelane_b32 v21, s46, 10
+; SI-NEXT:    s_lshr_b32 s46, s25, 16
+; SI-NEXT:    s_add_i32 s23, s23, 3
+; SI-NEXT:    v_writelane_b32 v21, s46, 11
+; SI-NEXT:    s_lshr_b32 s46, s25, 8
+; SI-NEXT:    v_writelane_b32 v21, s46, 12
+; SI-NEXT:    s_lshr_b32 s46, s23, 24
+; SI-NEXT:    v_writelane_b32 v21, s46, 13
+; SI-NEXT:    s_lshr_b32 s46, s23, 16
+; SI-NEXT:    s_add_i32 s21, s21, 3
+; SI-NEXT:    v_writelane_b32 v21, s46, 14
+; SI-NEXT:    s_lshr_b32 s46, s23, 8
+; SI-NEXT:    v_writelane_b32 v21, s46, 15
+; SI-NEXT:    s_lshr_b32 s46, s21, 24
+; SI-NEXT:    v_writelane_b32 v21, s46, 16
+; SI-NEXT:    s_lshr_b32 s46, s21, 16
+; SI-NEXT:    v_writelane_b32 v21, s46, 17
+; SI-NEXT:    s_lshr_b64 s[46:47], s[4:5], 8
+; SI-NEXT:    s_add_i32 s6, s6, 3
+; SI-NEXT:    v_writelane_b32 v22, s46, 36
+; SI-NEXT:    v_writelane_b32 v22, s47, 37
+; SI-NEXT:    s_lshr_b64 s[46:47], s[6:7], 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 34
+; SI-NEXT:    v_writelane_b32 v22, s47, 35
+; SI-NEXT:    s_lshr_b64 s[46:47], s[6:7], 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 32
+; SI-NEXT:    v_writelane_b32 v22, s47, 33
+; SI-NEXT:    s_lshr_b64 s[46:47], s[6:7], 8
+; SI-NEXT:    s_add_i32 s8, s8, 3
+; SI-NEXT:    v_writelane_b32 v22, s46, 30
+; SI-NEXT:    v_writelane_b32 v22, s47, 31
+; SI-NEXT:    s_lshr_b64 s[46:47], s[8:9], 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 28
+; SI-NEXT:    v_writelane_b32 v22, s47, 29
+; SI-NEXT:    s_lshr_b64 s[46:47], s[8:9], 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 26
+; SI-NEXT:    v_writelane_b32 v22, s47, 27
+; SI-NEXT:    s_lshr_b64 s[46:47], s[8:9], 8
+; SI-NEXT:    s_add_i32 s10, s10, 3
+; SI-NEXT:    v_writelane_b32 v22, s46, 24
+; SI-NEXT:    v_writelane_b32 v22, s47, 25
+; SI-NEXT:    s_lshr_b64 s[46:47], s[10:11], 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 22
+; SI-NEXT:    v_writelane_b32 v22, s47, 23
+; SI-NEXT:    s_lshr_b64 s[46:47], s[10:11], 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 20
+; SI-NEXT:    v_writelane_b32 v22, s47, 21
+; SI-NEXT:    s_lshr_b64 s[46:47], s[10:11], 8
+; SI-NEXT:    s_add_i32 s12, s12, 3
+; SI-NEXT:    v_writelane_b32 v22, s46, 18
+; SI-NEXT:    v_writelane_b32 v22, s47, 19
+; SI-NEXT:    s_lshr_b64 s[46:47], s[12:13], 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 16
+; SI-NEXT:    v_writelane_b32 v22, s47, 17
+; SI-NEXT:    s_lshr_b64 s[46:47], s[12:13], 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 14
+; SI-NEXT:    v_writelane_b32 v22, s47, 15
+; SI-NEXT:    s_lshr_b64 s[46:47], s[12:13], 8
+; SI-NEXT:    s_add_i32 s14, s14, 3
+; SI-NEXT:    v_writelane_b32 v22, s46, 12
+; SI-NEXT:    v_writelane_b32 v22, s47, 13
+; SI-NEXT:    s_lshr_b64 s[46:47], s[14:15], 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 10
+; SI-NEXT:    v_writelane_b32 v22, s47, 11
+; SI-NEXT:    s_lshr_b64 s[46:47], s[14:15], 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 8
+; SI-NEXT:    v_writelane_b32 v22, s47, 9
+; SI-NEXT:    s_lshr_b64 s[46:47], s[14:15], 8
+; SI-NEXT:    s_add_i32 s40, s40, 3
+; SI-NEXT:    v_writelane_b32 v22, s46, 6
+; SI-NEXT:    v_writelane_b32 v22, s47, 7
+; SI-NEXT:    s_lshr_b64 s[46:47], s[40:41], 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 4
+; SI-NEXT:    v_writelane_b32 v22, s47, 5
+; SI-NEXT:    s_lshr_b64 s[46:47], s[40:41], 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 2
+; SI-NEXT:    v_writelane_b32 v22, s47, 3
+; SI-NEXT:    s_lshr_b64 s[46:47], s[40:41], 8
+; SI-NEXT:    s_add_i32 s17, s17, 3
 ; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    s_add_i32 s19, s19, 3
 ; SI-NEXT:    s_add_i32 s18, s18, 3
 ; SI-NEXT:    s_add_i32 s20, s20, 3
 ; SI-NEXT:    s_add_i32 s22, s22, 3
 ; SI-NEXT:    s_add_i32 s24, s24, 3
 ; SI-NEXT:    s_add_i32 s26, s26, 3
 ; SI-NEXT:    s_add_i32 s28, s28, 3
-; SI-NEXT:    s_add_i32 s47, s47, 3
 ; SI-NEXT:    s_add_i32 s44, s44, 3
-; SI-NEXT:    s_add_i32 s43, s43, 3
-; SI-NEXT:    s_add_i32 s41, s41, 3
-; SI-NEXT:    s_add_i32 s15, s15, 3
-; SI-NEXT:    s_add_i32 s13, s13, 3
-; SI-NEXT:    s_add_i32 s11, s11, 3
-; SI-NEXT:    s_add_i32 s9, s9, 3
-; SI-NEXT:    s_add_i32 s7, s7, 3
-; SI-NEXT:    v_mov_b32_e32 v22, s45
-; SI-NEXT:    s_add_i32 s17, s17, 3
-; SI-NEXT:    s_add_i32 s19, s19, 3
-; SI-NEXT:    s_add_i32 s21, s21, 3
-; SI-NEXT:    s_add_i32 s23, s23, 3
-; SI-NEXT:    s_add_i32 s25, s25, 3
-; SI-NEXT:    s_add_i32 s27, s27, 3
-; SI-NEXT:    s_add_i32 s29, s29, 3
-; SI-NEXT:    s_add_i32 s46, s46, 3
 ; SI-NEXT:    s_add_i32 s42, s42, 3
-; SI-NEXT:    s_add_i32 s40, s40, 3
-; SI-NEXT:    s_add_i32 s14, s14, 3
-; SI-NEXT:    s_add_i32 s12, s12, 3
-; SI-NEXT:    s_add_i32 s10, s10, 3
-; SI-NEXT:    s_add_i32 s8, s8, 3
-; SI-NEXT:    s_add_i32 s6, s6, 3
-; SI-NEXT:    v_mov_b32_e32 v3, s7
-; SI-NEXT:    v_mov_b32_e32 v6, s9
-; SI-NEXT:    v_mov_b32_e32 v9, s11
-; SI-NEXT:    v_mov_b32_e32 v12, s13
-; SI-NEXT:    v_mov_b32_e32 v15, s15
-; SI-NEXT:    v_mov_b32_e32 v18, s41
-; SI-NEXT:    v_mov_b32_e32 v21, s43
-; SI-NEXT:    v_alignbit_b32 v24, s44, v22, 24
-; SI-NEXT:    v_alignbit_b32 v25, s44, v22, 16
-; SI-NEXT:    v_alignbit_b32 v26, s44, v22, 8
-; SI-NEXT:    v_mov_b32_e32 v22, s47
-; SI-NEXT:    v_mov_b32_e32 v23, s28
-; SI-NEXT:    v_mov_b32_e32 v29, s26
-; SI-NEXT:    v_mov_b32_e32 v35, s24
-; SI-NEXT:    v_mov_b32_e32 v39, s22
-; SI-NEXT:    v_mov_b32_e32 v50, s20
-; SI-NEXT:    v_mov_b32_e32 v53, s18
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v40, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v3, 24
-; SI-NEXT:    v_alignbit_b32 v2, s6, v3, 16
-; SI-NEXT:    v_alignbit_b32 v3, s6, v3, 8
-; SI-NEXT:    v_alignbit_b32 v4, s8, v6, 24
-; SI-NEXT:    v_alignbit_b32 v5, s8, v6, 16
-; SI-NEXT:    v_alignbit_b32 v6, s8, v6, 8
-; SI-NEXT:    v_alignbit_b32 v7, s10, v9, 24
-; SI-NEXT:    v_alignbit_b32 v8, s10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v9, s10, v9, 8
-; SI-NEXT:    v_alignbit_b32 v10, s12, v12, 24
-; SI-NEXT:    v_alignbit_b32 v11, s12, v12, 16
-; SI-NEXT:    v_alignbit_b32 v12, s12, v12, 8
-; SI-NEXT:    v_alignbit_b32 v13, s14, v15, 24
-; SI-NEXT:    v_alignbit_b32 v14, s14, v15, 16
-; SI-NEXT:    v_alignbit_b32 v15, s14, v15, 8
-; SI-NEXT:    v_alignbit_b32 v16, s40, v18, 24
-; SI-NEXT:    v_alignbit_b32 v17, s40, v18, 16
-; SI-NEXT:    v_alignbit_b32 v18, s40, v18, 8
-; SI-NEXT:    v_alignbit_b32 v19, s42, v21, 24
-; SI-NEXT:    v_alignbit_b32 v20, s42, v21, 16
-; SI-NEXT:    v_alignbit_b32 v21, s42, v21, 8
-; SI-NEXT:    v_alignbit_b32 v30, s46, v22, 24
-; SI-NEXT:    v_alignbit_b32 v31, s46, v22, 16
-; SI-NEXT:    v_alignbit_b32 v32, s46, v22, 8
-; SI-NEXT:    v_alignbit_b32 v36, s29, v23, 24
-; SI-NEXT:    v_alignbit_b32 v22, s29, v23, 16
-; SI-NEXT:    v_alignbit_b32 v23, s29, v23, 8
-; SI-NEXT:    v_alignbit_b32 v27, s27, v29, 24
-; SI-NEXT:    v_alignbit_b32 v28, s27, v29, 16
-; SI-NEXT:    v_alignbit_b32 v29, s27, v29, 8
-; SI-NEXT:    v_alignbit_b32 v33, s25, v35, 24
-; SI-NEXT:    v_alignbit_b32 v34, s25, v35, 16
-; SI-NEXT:    v_alignbit_b32 v35, s25, v35, 8
-; SI-NEXT:    v_alignbit_b32 v37, s23, v39, 24
-; SI-NEXT:    v_alignbit_b32 v38, s23, v39, 16
-; SI-NEXT:    v_alignbit_b32 v39, s23, v39, 8
-; SI-NEXT:    v_alignbit_b32 v48, s21, v50, 24
-; SI-NEXT:    v_alignbit_b32 v49, s21, v50, 16
-; SI-NEXT:    v_alignbit_b32 v50, s21, v50, 8
-; SI-NEXT:    v_alignbit_b32 v51, s19, v53, 24
-; SI-NEXT:    v_alignbit_b32 v52, s19, v53, 16
-; SI-NEXT:    v_alignbit_b32 v53, s19, v53, 8
-; SI-NEXT:    v_alignbit_b32 v54, s17, v40, 24
-; SI-NEXT:    v_alignbit_b32 v55, s17, v40, 16
-; SI-NEXT:    v_alignbit_b32 v40, s17, v40, 8
-; SI-NEXT:    s_lshr_b32 s56, s6, 24
-; SI-NEXT:    s_lshr_b32 s57, s6, 16
-; SI-NEXT:    s_lshr_b32 s58, s6, 8
-; SI-NEXT:    s_lshr_b32 s59, s8, 24
-; SI-NEXT:    s_lshr_b32 s60, s8, 16
-; SI-NEXT:    s_lshr_b32 s61, s8, 8
-; SI-NEXT:    s_lshr_b32 s62, s10, 24
-; SI-NEXT:    s_lshr_b32 s63, s10, 16
-; SI-NEXT:    s_lshr_b32 s72, s10, 8
-; SI-NEXT:    s_lshr_b32 s73, s12, 24
-; SI-NEXT:    s_lshr_b32 s74, s12, 16
-; SI-NEXT:    s_lshr_b32 s75, s12, 8
-; SI-NEXT:    s_lshr_b32 s76, s14, 24
-; SI-NEXT:    s_lshr_b32 s77, s14, 16
-; SI-NEXT:    s_lshr_b32 s78, s14, 8
-; SI-NEXT:    s_lshr_b32 s79, s40, 24
-; SI-NEXT:    s_lshr_b32 s88, s40, 16
-; SI-NEXT:    s_lshr_b32 s89, s40, 8
-; SI-NEXT:    s_lshr_b32 s90, s42, 24
-; SI-NEXT:    s_lshr_b32 s91, s42, 16
-; SI-NEXT:    s_lshr_b32 s92, s42, 8
-; SI-NEXT:    s_lshr_b32 s93, s44, 24
-; SI-NEXT:    s_lshr_b32 s94, s44, 16
-; SI-NEXT:    s_lshr_b32 s95, s44, 8
-; SI-NEXT:    s_lshr_b32 s30, s46, 24
-; SI-NEXT:    s_lshr_b32 s31, s46, 16
-; SI-NEXT:    s_lshr_b32 s34, s46, 8
-; SI-NEXT:    s_lshr_b32 s35, s29, 24
-; SI-NEXT:    s_lshr_b32 s36, s29, 16
-; SI-NEXT:    s_lshr_b32 s37, s29, 8
-; SI-NEXT:    s_lshr_b32 s38, s27, 24
-; SI-NEXT:    s_lshr_b32 s39, s27, 16
-; SI-NEXT:    s_lshr_b32 s48, s27, 8
-; SI-NEXT:    s_lshr_b32 s49, s25, 24
-; SI-NEXT:    s_lshr_b32 s50, s25, 16
-; SI-NEXT:    s_lshr_b32 s51, s25, 8
-; SI-NEXT:    s_lshr_b32 s52, s23, 24
-; SI-NEXT:    s_lshr_b32 s53, s23, 16
-; SI-NEXT:    s_lshr_b32 s54, s23, 8
-; SI-NEXT:    s_lshr_b32 s55, s21, 24
-; SI-NEXT:    s_lshr_b32 s64, s21, 16
-; SI-NEXT:    s_lshr_b32 s65, s21, 8
-; SI-NEXT:    s_lshr_b32 s66, s19, 24
-; SI-NEXT:    s_lshr_b32 s67, s19, 16
-; SI-NEXT:    s_lshr_b32 s68, s19, 8
-; SI-NEXT:    s_lshr_b32 s69, s17, 24
-; SI-NEXT:    s_lshr_b32 s70, s17, 16
-; SI-NEXT:    s_lshr_b32 s71, s17, 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 0
+; SI-NEXT:    s_lshr_b32 s50, s9, 24
+; SI-NEXT:    s_lshr_b32 s51, s21, 8
+; SI-NEXT:    s_lshr_b32 s48, s19, 24
+; SI-NEXT:    s_lshr_b32 s52, s19, 16
+; SI-NEXT:    s_lshr_b32 s53, s19, 8
+; SI-NEXT:    s_lshr_b32 s54, s17, 24
+; SI-NEXT:    s_lshr_b32 s55, s17, 16
+; SI-NEXT:    s_lshr_b32 s49, s17, 8
+; SI-NEXT:    v_writelane_b32 v22, s47, 1
+; SI-NEXT:    s_lshr_b64 s[64:65], s[42:43], 24
+; SI-NEXT:    s_lshr_b64 s[66:67], s[42:43], 16
+; SI-NEXT:    s_lshr_b64 s[68:69], s[42:43], 8
+; SI-NEXT:    s_lshr_b64 s[70:71], s[44:45], 24
+; SI-NEXT:    s_lshr_b64 s[80:81], s[44:45], 16
+; SI-NEXT:    s_lshr_b64 s[82:83], s[44:45], 8
+; SI-NEXT:    s_lshr_b64 s[84:85], s[28:29], 24
+; SI-NEXT:    s_lshr_b64 s[86:87], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[96:97], s[28:29], 8
+; SI-NEXT:    s_lshr_b64 s[98:99], s[26:27], 24
+; SI-NEXT:    s_lshr_b64 s[46:47], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[26:27], 8
+; SI-NEXT:    s_lshr_b64 s[58:59], s[24:25], 24
+; SI-NEXT:    s_lshr_b64 s[60:61], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[62:63], s[24:25], 8
+; SI-NEXT:    s_lshr_b64 s[72:73], s[22:23], 24
+; SI-NEXT:    s_lshr_b64 s[74:75], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[76:77], s[22:23], 8
+; SI-NEXT:    s_lshr_b64 s[78:79], s[20:21], 24
+; SI-NEXT:    s_lshr_b64 s[88:89], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[90:91], s[20:21], 8
+; SI-NEXT:    s_lshr_b64 s[92:93], s[18:19], 24
+; SI-NEXT:    s_lshr_b64 s[94:95], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[30:31], s[18:19], 8
+; SI-NEXT:    s_lshr_b64 s[34:35], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[36:37], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[38:39], s[16:17], 8
 ; SI-NEXT:  .LBB13_3: ; %end
-; SI-NEXT:    s_and_b32 s4, s16, 0xff
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v40, 8, v40
-; SI-NEXT:    v_or_b32_e32 v40, s4, v40
-; SI-NEXT:    s_and_b32 s4, s17, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s71, 8
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s70, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s69, 24
-; SI-NEXT:    v_and_b32_e32 v55, 0xff, v55
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
-; SI-NEXT:    v_lshlrev_b32_e32 v55, 16, v55
-; SI-NEXT:    v_lshlrev_b32_e32 v54, 24, v54
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_or_b32_e32 v54, v54, v55
-; SI-NEXT:    v_mov_b32_e32 v55, s4
-; SI-NEXT:    s_and_b32 s4, s18, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v53, 8, v53
-; SI-NEXT:    v_or_b32_e32 v53, s4, v53
-; SI-NEXT:    s_and_b32 s4, s19, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s68, 8
-; SI-NEXT:    v_and_b32_e32 v52, 0xff, v52
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s67, 0xff
-; SI-NEXT:    v_and_b32_e32 v40, 0xffff, v40
-; SI-NEXT:    v_lshlrev_b32_e32 v52, 16, v52
-; SI-NEXT:    v_lshlrev_b32_e32 v51, 24, v51
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s66, 24
-; SI-NEXT:    v_or_b32_e32 v54, v40, v54
-; SI-NEXT:    v_and_b32_e32 v53, 0xffff, v53
-; SI-NEXT:    v_or_b32_e32 v51, v51, v52
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
-; SI-NEXT:    buffer_store_dword v54, v0, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v54, vcc, 4, v0
-; SI-NEXT:    v_or_b32_e32 v51, v53, v51
-; SI-NEXT:    v_add_i32_e32 v52, vcc, 8, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v55, v54, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v51, v52, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v52, s4
-; SI-NEXT:    s_and_b32 s4, s20, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v50, 8, v50
-; SI-NEXT:    v_or_b32_e32 v50, s4, v50
-; SI-NEXT:    s_and_b32 s4, s21, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s65, 8
-; SI-NEXT:    v_and_b32_e32 v49, 0xff, v49
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s64, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v49, 16, v49
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 24, v48
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s55, 24
-; SI-NEXT:    v_and_b32_e32 v50, 0xffff, v50
-; SI-NEXT:    v_or_b32_e32 v48, v48, v49
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
+; SI-NEXT:    s_lshl_b32 s47, s38, 8
+; SI-NEXT:    s_and_b32 s16, s16, 0xff
+; SI-NEXT:    s_or_b32 s16, s16, s47
+; SI-NEXT:    s_and_b32 s47, s36, 0xff
+; SI-NEXT:    s_lshl_b32 s57, s34, 24
+; SI-NEXT:    s_lshl_b32 s47, s47, 16
+; SI-NEXT:    s_or_b32 s47, s57, s47
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s47
+; SI-NEXT:    v_mov_b32_e32 v1, s16
+; SI-NEXT:    s_and_b32 s16, s17, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s49, 8
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    s_and_b32 s17, s55, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_lshl_b32 s47, s54, 24
+; SI-NEXT:    s_or_b32 s17, s47, s17
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_lshl_b32 s16, s30, 8
+; SI-NEXT:    s_and_b32 s17, s18, 0xff
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_and_b32 s17, s94, 0xff
+; SI-NEXT:    s_lshl_b32 s18, s92, 24
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v3, s16
+; SI-NEXT:    s_and_b32 s16, s19, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s53, 8
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    s_and_b32 s17, s52, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_lshl_b32 s18, s48, 24
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v4, s16
+; SI-NEXT:    s_lshl_b32 s16, s90, 8
+; SI-NEXT:    s_and_b32 s17, s20, 0xff
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_and_b32 s17, s88, 0xff
+; SI-NEXT:    s_lshl_b32 s18, s78, 24
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v5, s16
+; SI-NEXT:    s_and_b32 s16, s21, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s51, 8
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_readlane_b32 s17, v21, 17
+; SI-NEXT:    s_and_b32 s17, s17, 0xff
+; SI-NEXT:    v_readlane_b32 s18, v21, 16
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_lshl_b32 s18, s18, 24
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v6, s16
+; SI-NEXT:    s_lshl_b32 s16, s76, 8
+; SI-NEXT:    s_and_b32 s17, s22, 0xff
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_and_b32 s17, s74, 0xff
+; SI-NEXT:    s_lshl_b32 s18, s72, 24
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_readlane_b32 s17, v21, 15
+; SI-NEXT:    v_mov_b32_e32 v7, s16
+; SI-NEXT:    s_and_b32 s16, s23, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 8
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_readlane_b32 s17, v21, 14
+; SI-NEXT:    s_and_b32 s17, s17, 0xff
+; SI-NEXT:    v_readlane_b32 s18, v21, 13
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_lshl_b32 s18, s18, 24
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v8, s16
+; SI-NEXT:    s_lshl_b32 s16, s62, 8
+; SI-NEXT:    s_and_b32 s17, s24, 0xff
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_and_b32 s17, s60, 0xff
+; SI-NEXT:    s_lshl_b32 s18, s58, 24
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_readlane_b32 s17, v21, 12
+; SI-NEXT:    v_mov_b32_e32 v9, s16
+; SI-NEXT:    s_and_b32 s16, s25, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 8
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_readlane_b32 s17, v21, 11
+; SI-NEXT:    s_and_b32 s17, s17, 0xff
+; SI-NEXT:    v_readlane_b32 s18, v21, 10
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_lshl_b32 s18, s18, 24
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v10, s16
+; SI-NEXT:    s_lshl_b32 s16, s56, 8
+; SI-NEXT:    s_and_b32 s17, s26, 0xff
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_and_b32 s17, s46, 0xff
+; SI-NEXT:    s_lshl_b32 s18, s98, 24
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_readlane_b32 s17, v21, 9
+; SI-NEXT:    v_mov_b32_e32 v11, s16
+; SI-NEXT:    s_and_b32 s16, s27, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 8
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_readlane_b32 s17, v21, 8
+; SI-NEXT:    s_and_b32 s17, s17, 0xff
+; SI-NEXT:    v_readlane_b32 s18, v21, 7
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_lshl_b32 s18, s18, 24
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v12, s16
+; SI-NEXT:    s_lshl_b32 s16, s96, 8
+; SI-NEXT:    s_and_b32 s17, s28, 0xff
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_and_b32 s17, s86, 0xff
+; SI-NEXT:    s_lshl_b32 s18, s84, 24
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_readlane_b32 s17, v21, 6
+; SI-NEXT:    v_mov_b32_e32 v13, s16
+; SI-NEXT:    s_and_b32 s16, s29, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 8
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_readlane_b32 s17, v21, 5
+; SI-NEXT:    s_and_b32 s17, s17, 0xff
+; SI-NEXT:    v_readlane_b32 s18, v21, 4
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_lshl_b32 s18, s18, 24
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v14, s16
+; SI-NEXT:    s_lshl_b32 s16, s82, 8
+; SI-NEXT:    s_and_b32 s17, s44, 0xff
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_and_b32 s17, s80, 0xff
+; SI-NEXT:    s_lshl_b32 s18, s70, 24
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_readlane_b32 s17, v21, 3
+; SI-NEXT:    v_mov_b32_e32 v15, s16
+; SI-NEXT:    s_and_b32 s16, s45, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 8
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_readlane_b32 s17, v21, 2
+; SI-NEXT:    s_and_b32 s17, s17, 0xff
+; SI-NEXT:    v_readlane_b32 s18, v21, 1
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v51, vcc, 12, v0
-; SI-NEXT:    v_or_b32_e32 v48, v50, v48
-; SI-NEXT:    v_add_i32_e32 v49, vcc, 16, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v52, v51, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v48, v49, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v49, s4
-; SI-NEXT:    s_and_b32 s4, s22, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v39, 8, v39
-; SI-NEXT:    v_or_b32_e32 v39, s4, v39
-; SI-NEXT:    s_and_b32 s4, s23, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s54, 8
-; SI-NEXT:    v_and_b32_e32 v38, 0xff, v38
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s53, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v38, 16, v38
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 24, v37
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s52, 24
-; SI-NEXT:    v_and_b32_e32 v39, 0xffff, v39
-; SI-NEXT:    v_or_b32_e32 v37, v37, v38
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_lshl_b32 s18, s18, 24
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 12, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 16, v0
+; SI-NEXT:    v_mov_b32_e32 v16, s16
+; SI-NEXT:    s_lshl_b32 s16, s68, 8
+; SI-NEXT:    s_and_b32 s17, s42, 0xff
+; SI-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 20, v0
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_and_b32 s17, s66, 0xff
+; SI-NEXT:    buffer_store_dword v6, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 24, v0
+; SI-NEXT:    s_lshl_b32 s18, s64, 24
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    buffer_store_dword v7, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 28, v0
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 32, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 36, v0
+; SI-NEXT:    v_readlane_b32 s17, v21, 0
+; SI-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 40, v0
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s43, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 8
+; SI-NEXT:    buffer_store_dword v11, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 44, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_readlane_b32 s17, v22, 63
+; SI-NEXT:    buffer_store_dword v12, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 48, v0
+; SI-NEXT:    s_and_b32 s17, s17, 0xff
+; SI-NEXT:    v_readlane_b32 s18, v22, 62
+; SI-NEXT:    buffer_store_dword v13, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 52, v0
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_lshl_b32 s18, s18, 24
+; SI-NEXT:    buffer_store_dword v14, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 56, v0
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    v_readlane_b32 s18, v22, 0
+; SI-NEXT:    buffer_store_dword v15, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 60, v0
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    v_readlane_b32 s19, v22, 1
+; SI-NEXT:    buffer_store_dword v16, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 64, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    s_lshl_b32 s17, s18, 8
+; SI-NEXT:    v_readlane_b32 s18, v22, 2
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v48, vcc, 20, v0
-; SI-NEXT:    v_or_b32_e32 v37, v39, v37
-; SI-NEXT:    v_add_i32_e32 v38, vcc, 24, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v49, v48, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v37, v38, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v38, s4
-; SI-NEXT:    s_and_b32 s4, s24, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v35, 8, v35
-; SI-NEXT:    v_or_b32_e32 v35, s4, v35
-; SI-NEXT:    s_and_b32 s4, s25, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s51, 8
-; SI-NEXT:    v_and_b32_e32 v34, 0xff, v34
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s50, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v34
-; SI-NEXT:    v_lshlrev_b32_e32 v33, 24, v33
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s49, 24
-; SI-NEXT:    v_and_b32_e32 v35, 0xffff, v35
-; SI-NEXT:    v_or_b32_e32 v33, v33, v34
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s40, 0xff
+; SI-NEXT:    v_readlane_b32 s19, v22, 3
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    s_and_b32 s17, s18, 0xff
+; SI-NEXT:    v_readlane_b32 s18, v22, 4
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_lshl_b32 s18, s18, 24
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x44, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_readlane_b32 s17, v22, 61
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v37, vcc, 28, v0
-; SI-NEXT:    v_or_b32_e32 v33, v35, v33
-; SI-NEXT:    v_add_i32_e32 v34, vcc, 32, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v38, v37, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v33, v34, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v34, s4
-; SI-NEXT:    s_and_b32 s4, s26, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v29, 8, v29
-; SI-NEXT:    v_or_b32_e32 v29, s4, v29
-; SI-NEXT:    s_and_b32 s4, s27, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s48, 8
-; SI-NEXT:    v_and_b32_e32 v28, 0xff, v28
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s39, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
-; SI-NEXT:    v_lshlrev_b32_e32 v27, 24, v27
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s38, 24
-; SI-NEXT:    v_and_b32_e32 v29, 0xffff, v29
-; SI-NEXT:    v_or_b32_e32 v27, v27, v28
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s41, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 8
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_readlane_b32 s17, v22, 60
+; SI-NEXT:    s_and_b32 s17, s17, 0xff
+; SI-NEXT:    v_readlane_b32 s18, v22, 59
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_lshl_b32 s18, s18, 24
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x48, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v33, vcc, 36, v0
-; SI-NEXT:    v_or_b32_e32 v27, v29, v27
-; SI-NEXT:    v_add_i32_e32 v28, vcc, 40, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v34, v33, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v27, v28, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v28, s4
-; SI-NEXT:    s_and_b32 s4, s28, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v23, 8, v23
-; SI-NEXT:    v_or_b32_e32 v23, s4, v23
-; SI-NEXT:    s_and_b32 s4, s29, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s37, 8
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    v_readlane_b32 s16, v22, 6
+; SI-NEXT:    s_and_b32 s14, s14, 0xff
+; SI-NEXT:    v_readlane_b32 s17, v22, 7
+; SI-NEXT:    s_lshl_b32 s16, s16, 8
+; SI-NEXT:    v_readlane_b32 s19, v22, 5
+; SI-NEXT:    s_or_b32 s14, s14, s16
+; SI-NEXT:    v_readlane_b32 s16, v22, 8
+; SI-NEXT:    v_readlane_b32 s17, v22, 9
+; SI-NEXT:    s_and_b32 s16, s16, 0xff
+; SI-NEXT:    v_readlane_b32 s18, v22, 10
+; SI-NEXT:    s_lshl_b32 s16, s16, 16
+; SI-NEXT:    s_lshl_b32 s17, s18, 24
+; SI-NEXT:    s_and_b32 s14, s14, 0xffff
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x4c, v0
+; SI-NEXT:    s_or_b32 s14, s14, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v27, vcc, 44, v0
-; SI-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s36, 0xff
-; SI-NEXT:    buffer_store_dword v28, v27, s[0:3], 0 offen
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; SI-NEXT:    v_lshlrev_b32_e32 v27, 24, v36
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s35, 24
-; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v23
-; SI-NEXT:    v_or_b32_e32 v22, v27, v22
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
-; SI-NEXT:    v_or_b32_e32 v22, v23, v22
-; SI-NEXT:    v_add_i32_e32 v23, vcc, 48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v22, v23, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v2, s14
+; SI-NEXT:    s_and_b32 s14, s15, 0xff
+; SI-NEXT:    v_readlane_b32 s15, v22, 58
+; SI-NEXT:    s_lshl_b32 s15, s15, 8
+; SI-NEXT:    s_or_b32 s14, s14, s15
+; SI-NEXT:    v_readlane_b32 s15, v22, 57
+; SI-NEXT:    s_and_b32 s15, s15, 0xff
+; SI-NEXT:    v_readlane_b32 s16, v22, 56
+; SI-NEXT:    s_lshl_b32 s15, s15, 16
+; SI-NEXT:    s_lshl_b32 s16, s16, 24
+; SI-NEXT:    s_and_b32 s14, s14, 0xffff
+; SI-NEXT:    s_or_b32 s15, s16, s15
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x50, v0
+; SI-NEXT:    s_or_b32 s14, s14, s15
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v22, vcc, 52, v0
-; SI-NEXT:    v_mov_b32_e32 v23, s4
-; SI-NEXT:    buffer_store_dword v23, v22, s[0:3], 0 offen
-; SI-NEXT:    s_and_b32 s4, s47, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 8, v32
-; SI-NEXT:    v_or_b32_e32 v22, s4, v22
-; SI-NEXT:    s_and_b32 s4, s46, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s34, 8
+; SI-NEXT:    v_mov_b32_e32 v2, s14
+; SI-NEXT:    v_readlane_b32 s14, v22, 12
+; SI-NEXT:    s_and_b32 s12, s12, 0xff
+; SI-NEXT:    v_readlane_b32 s15, v22, 13
+; SI-NEXT:    s_lshl_b32 s14, s14, 8
+; SI-NEXT:    s_or_b32 s12, s12, s14
+; SI-NEXT:    v_readlane_b32 s14, v22, 14
+; SI-NEXT:    v_readlane_b32 s15, v22, 15
+; SI-NEXT:    s_and_b32 s14, s14, 0xff
+; SI-NEXT:    v_readlane_b32 s16, v22, 16
+; SI-NEXT:    s_lshl_b32 s14, s14, 16
+; SI-NEXT:    s_lshl_b32 s15, s16, 24
+; SI-NEXT:    s_and_b32 s12, s12, 0xffff
+; SI-NEXT:    s_or_b32 s14, s15, s14
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x54, v0
+; SI-NEXT:    s_or_b32 s12, s12, s14
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v23, 0xff, v31
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s31, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; SI-NEXT:    v_lshlrev_b32_e32 v27, 24, v30
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s30, 24
-; SI-NEXT:    v_and_b32_e32 v22, 0xffff, v22
-; SI-NEXT:    v_or_b32_e32 v23, v27, v23
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
-; SI-NEXT:    v_or_b32_e32 v22, v22, v23
-; SI-NEXT:    v_add_i32_e32 v23, vcc, 56, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v22, v23, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v2, s12
+; SI-NEXT:    s_and_b32 s12, s13, 0xff
+; SI-NEXT:    v_readlane_b32 s13, v22, 55
+; SI-NEXT:    s_lshl_b32 s13, s13, 8
+; SI-NEXT:    s_or_b32 s12, s12, s13
+; SI-NEXT:    v_readlane_b32 s13, v22, 54
+; SI-NEXT:    s_and_b32 s13, s13, 0xff
+; SI-NEXT:    v_readlane_b32 s14, v22, 53
+; SI-NEXT:    s_lshl_b32 s13, s13, 16
+; SI-NEXT:    s_lshl_b32 s14, s14, 24
+; SI-NEXT:    s_and_b32 s12, s12, 0xffff
+; SI-NEXT:    s_or_b32 s13, s14, s13
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x58, v0
+; SI-NEXT:    s_or_b32 s12, s12, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v22, vcc, 60, v0
-; SI-NEXT:    v_mov_b32_e32 v23, s4
-; SI-NEXT:    buffer_store_dword v23, v22, s[0:3], 0 offen
-; SI-NEXT:    s_and_b32 s4, s45, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 8, v26
-; SI-NEXT:    v_or_b32_e32 v22, s4, v22
-; SI-NEXT:    s_and_b32 s4, s44, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s95, 8
+; SI-NEXT:    v_mov_b32_e32 v2, s12
+; SI-NEXT:    v_readlane_b32 s12, v22, 18
+; SI-NEXT:    s_and_b32 s10, s10, 0xff
+; SI-NEXT:    v_readlane_b32 s13, v22, 19
+; SI-NEXT:    s_lshl_b32 s12, s12, 8
+; SI-NEXT:    s_or_b32 s10, s10, s12
+; SI-NEXT:    v_readlane_b32 s12, v22, 20
+; SI-NEXT:    v_readlane_b32 s13, v22, 21
+; SI-NEXT:    s_and_b32 s12, s12, 0xff
+; SI-NEXT:    v_readlane_b32 s14, v22, 22
+; SI-NEXT:    s_lshl_b32 s12, s12, 16
+; SI-NEXT:    s_lshl_b32 s13, s14, 24
+; SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; SI-NEXT:    s_or_b32 s12, s13, s12
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x5c, v0
+; SI-NEXT:    s_or_b32 s10, s10, s12
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v23, 0xff, v25
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s94, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; SI-NEXT:    v_lshlrev_b32_e32 v24, 24, v24
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s93, 24
-; SI-NEXT:    v_and_b32_e32 v22, 0xffff, v22
-; SI-NEXT:    v_or_b32_e32 v23, v24, v23
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
-; SI-NEXT:    v_or_b32_e32 v22, v22, v23
-; SI-NEXT:    v_add_i32_e32 v23, vcc, 64, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v22, v23, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v23, s4
-; SI-NEXT:    s_and_b32 s4, s43, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 8, v21
-; SI-NEXT:    v_or_b32_e32 v21, s4, v21
-; SI-NEXT:    s_and_b32 s4, s42, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s92, 8
-; SI-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s91, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 24, v19
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s90, 24
-; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v21
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s10, s11, 0xff
+; SI-NEXT:    v_readlane_b32 s11, v22, 52
+; SI-NEXT:    s_lshl_b32 s11, s11, 8
+; SI-NEXT:    s_or_b32 s10, s10, s11
+; SI-NEXT:    v_readlane_b32 s11, v22, 51
+; SI-NEXT:    s_and_b32 s11, s11, 0xff
+; SI-NEXT:    v_readlane_b32 s12, v22, 50
+; SI-NEXT:    s_lshl_b32 s11, s11, 16
+; SI-NEXT:    s_lshl_b32 s12, s12, 24
+; SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; SI-NEXT:    s_or_b32 s11, s12, s11
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x60, v0
+; SI-NEXT:    s_or_b32 s10, s10, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v22, vcc, 0x44, v0
-; SI-NEXT:    v_or_b32_e32 v19, v21, v19
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 0x48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v23, v22, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v20, s4
-; SI-NEXT:    s_and_b32 s4, s41, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 8, v18
-; SI-NEXT:    v_or_b32_e32 v18, s4, v18
-; SI-NEXT:    s_and_b32 s4, s40, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s89, 8
-; SI-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s88, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 24, v16
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s79, 24
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v18
-; SI-NEXT:    v_or_b32_e32 v16, v16, v17
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    v_readlane_b32 s10, v22, 24
+; SI-NEXT:    s_and_b32 s8, s8, 0xff
+; SI-NEXT:    v_readlane_b32 s11, v22, 25
+; SI-NEXT:    s_lshl_b32 s10, s10, 8
+; SI-NEXT:    s_or_b32 s8, s8, s10
+; SI-NEXT:    v_readlane_b32 s10, v22, 26
+; SI-NEXT:    v_readlane_b32 s11, v22, 27
+; SI-NEXT:    s_and_b32 s10, s10, 0xff
+; SI-NEXT:    v_readlane_b32 s12, v22, 28
+; SI-NEXT:    s_lshl_b32 s10, s10, 16
+; SI-NEXT:    s_lshl_b32 s11, s12, 24
+; SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; SI-NEXT:    s_or_b32 s10, s11, s10
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x64, v0
+; SI-NEXT:    s_or_b32 s8, s8, s10
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v19, vcc, 0x4c, v0
-; SI-NEXT:    v_or_b32_e32 v16, v18, v16
-; SI-NEXT:    v_add_i32_e32 v17, vcc, 0x50, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v20, v19, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v16, v17, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v17, s4
-; SI-NEXT:    s_and_b32 s4, s15, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 8, v15
-; SI-NEXT:    v_or_b32_e32 v15, s4, v15
-; SI-NEXT:    s_and_b32 s4, s14, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s78, 8
-; SI-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s77, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 24, v13
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s14, s76, 24
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; SI-NEXT:    v_or_b32_e32 v13, v13, v14
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s14, s5
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s8, s9, 0xff
+; SI-NEXT:    v_readlane_b32 s9, v22, 49
+; SI-NEXT:    s_lshl_b32 s9, s9, 8
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    v_readlane_b32 s9, v22, 48
+; SI-NEXT:    s_and_b32 s9, s9, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_lshl_b32 s10, s50, 24
+; SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; SI-NEXT:    s_or_b32 s9, s10, s9
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x68, v0
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v16, vcc, 0x54, v0
-; SI-NEXT:    v_or_b32_e32 v13, v15, v13
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 0x58, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v17, v16, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v14, s4
-; SI-NEXT:    s_and_b32 s4, s13, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 8, v12
-; SI-NEXT:    v_or_b32_e32 v12, s4, v12
-; SI-NEXT:    s_and_b32 s4, s12, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s75, 8
-; SI-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s74, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s12, s73, 24
-; SI-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; SI-NEXT:    v_or_b32_e32 v10, v10, v11
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s12, s5
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_readlane_b32 s8, v22, 30
+; SI-NEXT:    s_and_b32 s6, s6, 0xff
+; SI-NEXT:    v_readlane_b32 s9, v22, 31
+; SI-NEXT:    s_lshl_b32 s8, s8, 8
+; SI-NEXT:    s_or_b32 s6, s6, s8
+; SI-NEXT:    v_readlane_b32 s8, v22, 32
+; SI-NEXT:    v_readlane_b32 s9, v22, 33
+; SI-NEXT:    s_and_b32 s8, s8, 0xff
+; SI-NEXT:    v_readlane_b32 s10, v22, 34
+; SI-NEXT:    s_lshl_b32 s8, s8, 16
+; SI-NEXT:    s_lshl_b32 s9, s10, 24
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_or_b32 s8, s9, s8
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x6c, v0
+; SI-NEXT:    s_or_b32 s6, s6, s8
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v13, vcc, 0x5c, v0
-; SI-NEXT:    v_or_b32_e32 v10, v12, v10
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 0x60, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v14, v13, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v10, v11, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v11, s4
-; SI-NEXT:    s_and_b32 s4, s11, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; SI-NEXT:    v_or_b32_e32 v9, s4, v9
-; SI-NEXT:    s_and_b32 s4, s10, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s72, 8
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s63, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s10, s62, 24
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; SI-NEXT:    v_or_b32_e32 v7, v7, v8
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s10, s5
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s6, s7, 0xff
+; SI-NEXT:    v_readlane_b32 s7, v22, 47
+; SI-NEXT:    s_lshl_b32 s7, s7, 8
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    v_readlane_b32 s7, v22, 46
+; SI-NEXT:    s_and_b32 s7, s7, 0xff
+; SI-NEXT:    v_readlane_b32 s8, v22, 45
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    s_lshl_b32 s8, s8, 24
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_or_b32 s7, s8, s7
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x70, v0
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 0x64, v0
-; SI-NEXT:    v_or_b32_e32 v7, v9, v7
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x68, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v11, v10, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v8, s4
-; SI-NEXT:    s_and_b32 s4, s9, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
-; SI-NEXT:    v_or_b32_e32 v6, s4, v6
-; SI-NEXT:    s_and_b32 s4, s8, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s61, 8
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s60, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s8, s59, 24
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; SI-NEXT:    v_or_b32_e32 v4, v4, v5
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    v_readlane_b32 s6, v22, 36
+; SI-NEXT:    s_and_b32 s4, s4, 0xff
+; SI-NEXT:    v_readlane_b32 s7, v22, 37
+; SI-NEXT:    s_lshl_b32 s6, s6, 8
+; SI-NEXT:    s_or_b32 s4, s4, s6
+; SI-NEXT:    v_readlane_b32 s6, v22, 38
+; SI-NEXT:    v_readlane_b32 s7, v22, 39
+; SI-NEXT:    s_and_b32 s6, s6, 0xff
+; SI-NEXT:    v_readlane_b32 s8, v22, 40
+; SI-NEXT:    s_lshl_b32 s6, s6, 16
+; SI-NEXT:    s_lshl_b32 s7, s8, 24
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s8, s5
+; SI-NEXT:    s_or_b32 s6, s7, s6
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x74, v0
+; SI-NEXT:    s_or_b32 s4, s4, s6
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x6c, v0
-; SI-NEXT:    v_or_b32_e32 v4, v6, v4
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x70, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v8, v7, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v5, s4
-; SI-NEXT:    s_and_b32 s4, s7, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; SI-NEXT:    v_or_b32_e32 v3, s4, v3
-; SI-NEXT:    s_and_b32 s4, s6, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s58, 8
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT:    v_mov_b32_e32 v2, s4
+; SI-NEXT:    s_and_b32 s4, s5, 0xff
+; SI-NEXT:    v_readlane_b32 s5, v22, 44
+; SI-NEXT:    s_lshl_b32 s5, s5, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s57, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; SI-NEXT:    v_readlane_b32 s5, v22, 43
+; SI-NEXT:    s_and_b32 s5, s5, 0xff
+; SI-NEXT:    v_readlane_b32 s6, v22, 42
 ; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s6, s56, 24
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    s_lshl_b32 s6, s6, 24
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s5, s6, s5
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x74, v0
-; SI-NEXT:    v_or_b32_e32 v1, v3, v1
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x78, v0
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x78, v0
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
+; SI-NEXT:    v_readlane_b32 s19, v22, 11
+; SI-NEXT:    v_readlane_b32 s17, v22, 17
+; SI-NEXT:    v_readlane_b32 s15, v22, 23
+; SI-NEXT:    v_readlane_b32 s13, v22, 29
+; SI-NEXT:    v_readlane_b32 s11, v22, 35
+; SI-NEXT:    v_readlane_b32 s9, v22, 41
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT:    v_readlane_b32 s71, v41, 23
-; SI-NEXT:    v_readlane_b32 s70, v41, 22
-; SI-NEXT:    v_readlane_b32 s69, v41, 21
-; SI-NEXT:    v_readlane_b32 s68, v41, 20
-; SI-NEXT:    v_readlane_b32 s67, v41, 19
-; SI-NEXT:    v_readlane_b32 s66, v41, 18
-; SI-NEXT:    v_readlane_b32 s65, v41, 17
-; SI-NEXT:    v_readlane_b32 s64, v41, 16
-; SI-NEXT:    v_readlane_b32 s55, v41, 15
-; SI-NEXT:    v_readlane_b32 s54, v41, 14
-; SI-NEXT:    v_readlane_b32 s53, v41, 13
-; SI-NEXT:    v_readlane_b32 s52, v41, 12
-; SI-NEXT:    v_readlane_b32 s51, v41, 11
-; SI-NEXT:    v_readlane_b32 s50, v41, 10
-; SI-NEXT:    v_readlane_b32 s49, v41, 9
-; SI-NEXT:    v_readlane_b32 s48, v41, 8
-; SI-NEXT:    v_readlane_b32 s39, v41, 7
-; SI-NEXT:    v_readlane_b32 s38, v41, 6
-; SI-NEXT:    v_readlane_b32 s37, v41, 5
-; SI-NEXT:    v_readlane_b32 s36, v41, 4
-; SI-NEXT:    v_readlane_b32 s35, v41, 3
-; SI-NEXT:    v_readlane_b32 s34, v41, 2
-; SI-NEXT:    v_readlane_b32 s31, v41, 1
-; SI-NEXT:    v_readlane_b32 s30, v41, 0
-; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT:    v_readlane_b32 s99, v20, 35
+; SI-NEXT:    v_readlane_b32 s98, v20, 34
+; SI-NEXT:    v_readlane_b32 s97, v20, 33
+; SI-NEXT:    v_readlane_b32 s96, v20, 32
+; SI-NEXT:    v_readlane_b32 s87, v20, 31
+; SI-NEXT:    v_readlane_b32 s86, v20, 30
+; SI-NEXT:    v_readlane_b32 s85, v20, 29
+; SI-NEXT:    v_readlane_b32 s84, v20, 28
+; SI-NEXT:    v_readlane_b32 s83, v20, 27
+; SI-NEXT:    v_readlane_b32 s82, v20, 26
+; SI-NEXT:    v_readlane_b32 s81, v20, 25
+; SI-NEXT:    v_readlane_b32 s80, v20, 24
+; SI-NEXT:    v_readlane_b32 s71, v20, 23
+; SI-NEXT:    v_readlane_b32 s70, v20, 22
+; SI-NEXT:    v_readlane_b32 s69, v20, 21
+; SI-NEXT:    v_readlane_b32 s68, v20, 20
+; SI-NEXT:    v_readlane_b32 s67, v20, 19
+; SI-NEXT:    v_readlane_b32 s66, v20, 18
+; SI-NEXT:    v_readlane_b32 s65, v20, 17
+; SI-NEXT:    v_readlane_b32 s64, v20, 16
+; SI-NEXT:    v_readlane_b32 s55, v20, 15
+; SI-NEXT:    v_readlane_b32 s54, v20, 14
+; SI-NEXT:    v_readlane_b32 s53, v20, 13
+; SI-NEXT:    v_readlane_b32 s52, v20, 12
+; SI-NEXT:    v_readlane_b32 s51, v20, 11
+; SI-NEXT:    v_readlane_b32 s50, v20, 10
+; SI-NEXT:    v_readlane_b32 s49, v20, 9
+; SI-NEXT:    v_readlane_b32 s48, v20, 8
+; SI-NEXT:    v_readlane_b32 s39, v20, 7
+; SI-NEXT:    v_readlane_b32 s38, v20, 6
+; SI-NEXT:    v_readlane_b32 s37, v20, 5
+; SI-NEXT:    v_readlane_b32 s36, v20, 4
+; SI-NEXT:    v_readlane_b32 s35, v20, 3
+; SI-NEXT:    v_readlane_b32 s34, v20, 2
+; SI-NEXT:    v_readlane_b32 s31, v20, 1
+; SI-NEXT:    v_readlane_b32 s30, v20, 0
+; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB13_4:
-; SI-NEXT:    ; implicit-def: $vgpr40
-; SI-NEXT:    ; implicit-def: $vgpr55
-; SI-NEXT:    ; implicit-def: $vgpr54
-; SI-NEXT:    ; implicit-def: $sgpr71
-; SI-NEXT:    ; implicit-def: $sgpr70
-; SI-NEXT:    ; implicit-def: $sgpr69
-; SI-NEXT:    ; implicit-def: $vgpr53
-; SI-NEXT:    ; implicit-def: $vgpr52
-; SI-NEXT:    ; implicit-def: $vgpr51
-; SI-NEXT:    ; implicit-def: $sgpr68
-; SI-NEXT:    ; implicit-def: $sgpr67
-; SI-NEXT:    ; implicit-def: $sgpr66
-; SI-NEXT:    ; implicit-def: $vgpr50
-; SI-NEXT:    ; implicit-def: $vgpr49
-; SI-NEXT:    ; implicit-def: $vgpr48
-; SI-NEXT:    ; implicit-def: $sgpr65
-; SI-NEXT:    ; implicit-def: $sgpr64
+; SI-NEXT:    ; implicit-def: $sgpr51
+; SI-NEXT:    ; implicit-def: $sgpr50
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_writelane_b32 v22, s50, 0
+; SI-NEXT:    v_writelane_b32 v22, s51, 1
+; SI-NEXT:    ; implicit-def: $sgpr50
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s50, 2
+; SI-NEXT:    v_writelane_b32 v22, s51, 3
+; SI-NEXT:    ; implicit-def: $sgpr50
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s50, 4
+; SI-NEXT:    v_writelane_b32 v22, s51, 5
+; SI-NEXT:    ; implicit-def: $sgpr50
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s50, 6
+; SI-NEXT:    v_writelane_b32 v22, s51, 7
+; SI-NEXT:    ; implicit-def: $sgpr50
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s50, 8
+; SI-NEXT:    v_writelane_b32 v22, s51, 9
+; SI-NEXT:    ; implicit-def: $sgpr50
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s50, 10
+; SI-NEXT:    v_writelane_b32 v22, s51, 11
+; SI-NEXT:    ; implicit-def: $sgpr50
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s50, 12
+; SI-NEXT:    v_writelane_b32 v22, s51, 13
+; SI-NEXT:    ; implicit-def: $sgpr50
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s50, 14
+; SI-NEXT:    v_writelane_b32 v22, s51, 15
+; SI-NEXT:    ; implicit-def: $sgpr50
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s50, 16
+; SI-NEXT:    v_writelane_b32 v22, s51, 17
+; SI-NEXT:    ; implicit-def: $sgpr50
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s50, 18
+; SI-NEXT:    v_writelane_b32 v22, s51, 19
+; SI-NEXT:    ; implicit-def: $sgpr50
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s50, 20
+; SI-NEXT:    v_writelane_b32 v22, s51, 21
+; SI-NEXT:    ; implicit-def: $sgpr50
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s50, 22
+; SI-NEXT:    v_writelane_b32 v22, s51, 23
+; SI-NEXT:    ; implicit-def: $sgpr50
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s50, 24
+; SI-NEXT:    v_writelane_b32 v22, s51, 25
+; SI-NEXT:    ; implicit-def: $sgpr50
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s50, 26
+; SI-NEXT:    v_writelane_b32 v22, s51, 27
+; SI-NEXT:    ; implicit-def: $sgpr50
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s50, 28
+; SI-NEXT:    v_writelane_b32 v22, s51, 29
+; SI-NEXT:    ; implicit-def: $sgpr50
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s50, 30
+; SI-NEXT:    v_writelane_b32 v22, s51, 31
+; SI-NEXT:    ; implicit-def: $sgpr50
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s50, 32
+; SI-NEXT:    v_writelane_b32 v22, s51, 33
+; SI-NEXT:    ; implicit-def: $sgpr50
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s50, 34
+; SI-NEXT:    v_writelane_b32 v22, s51, 35
+; SI-NEXT:    ; implicit-def: $sgpr50
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s50, 36
+; SI-NEXT:    v_writelane_b32 v22, s51, 37
+; SI-NEXT:    ; implicit-def: $sgpr50
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s50, 38
+; SI-NEXT:    v_writelane_b32 v22, s51, 39
+; SI-NEXT:    ; implicit-def: $sgpr50
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr49
 ; SI-NEXT:    ; implicit-def: $sgpr55
-; SI-NEXT:    ; implicit-def: $vgpr39
-; SI-NEXT:    ; implicit-def: $vgpr38
-; SI-NEXT:    ; implicit-def: $vgpr37
 ; SI-NEXT:    ; implicit-def: $sgpr54
 ; SI-NEXT:    ; implicit-def: $sgpr53
 ; SI-NEXT:    ; implicit-def: $sgpr52
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr34
-; SI-NEXT:    ; implicit-def: $vgpr33
-; SI-NEXT:    ; implicit-def: $sgpr51
-; SI-NEXT:    ; implicit-def: $sgpr50
-; SI-NEXT:    ; implicit-def: $sgpr49
-; SI-NEXT:    ; implicit-def: $vgpr29
-; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr27
 ; SI-NEXT:    ; implicit-def: $sgpr48
-; SI-NEXT:    ; implicit-def: $sgpr39
+; SI-NEXT:    ; kill: killed $sgpr46
 ; SI-NEXT:    ; implicit-def: $sgpr38
-; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; implicit-def: $sgpr37
 ; SI-NEXT:    ; implicit-def: $sgpr36
-; SI-NEXT:    ; implicit-def: $sgpr35
 ; SI-NEXT:    ; implicit-def: $sgpr34
-; SI-NEXT:    ; implicit-def: $sgpr31
 ; SI-NEXT:    ; implicit-def: $sgpr30
-; SI-NEXT:    ; implicit-def: $sgpr95
 ; SI-NEXT:    ; implicit-def: $sgpr94
-; SI-NEXT:    ; implicit-def: $sgpr93
 ; SI-NEXT:    ; implicit-def: $sgpr92
-; SI-NEXT:    ; implicit-def: $sgpr91
 ; SI-NEXT:    ; implicit-def: $sgpr90
-; SI-NEXT:    ; implicit-def: $sgpr89
 ; SI-NEXT:    ; implicit-def: $sgpr88
-; SI-NEXT:    ; implicit-def: $sgpr79
 ; SI-NEXT:    ; implicit-def: $sgpr78
-; SI-NEXT:    ; implicit-def: $sgpr77
 ; SI-NEXT:    ; implicit-def: $sgpr76
-; SI-NEXT:    ; implicit-def: $sgpr75
 ; SI-NEXT:    ; implicit-def: $sgpr74
-; SI-NEXT:    ; implicit-def: $sgpr73
 ; SI-NEXT:    ; implicit-def: $sgpr72
-; SI-NEXT:    ; implicit-def: $sgpr63
 ; SI-NEXT:    ; implicit-def: $sgpr62
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; implicit-def: $sgpr61
-; SI-NEXT:    ; implicit-def: $vgpr32
 ; SI-NEXT:    ; implicit-def: $sgpr60
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; implicit-def: $sgpr59
-; SI-NEXT:    ; implicit-def: $vgpr30
 ; SI-NEXT:    ; implicit-def: $sgpr58
-; SI-NEXT:    ; implicit-def: $vgpr26
-; SI-NEXT:    ; implicit-def: $sgpr57
-; SI-NEXT:    ; implicit-def: $vgpr25
 ; SI-NEXT:    ; implicit-def: $sgpr56
-; SI-NEXT:    ; implicit-def: $vgpr24
-; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr20
-; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    ; implicit-def: $vgpr18
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr16
-; SI-NEXT:    ; implicit-def: $vgpr15
-; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr7
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s50, 40
+; SI-NEXT:    ; implicit-def: $sgpr98
+; SI-NEXT:    ; implicit-def: $sgpr96
+; SI-NEXT:    ; implicit-def: $sgpr86
+; SI-NEXT:    ; implicit-def: $sgpr84
+; SI-NEXT:    ; implicit-def: $sgpr82
+; SI-NEXT:    ; implicit-def: $sgpr80
+; SI-NEXT:    ; implicit-def: $sgpr70
+; SI-NEXT:    ; implicit-def: $sgpr68
+; SI-NEXT:    ; implicit-def: $sgpr66
+; SI-NEXT:    ; implicit-def: $sgpr64
+; SI-NEXT:    v_writelane_b32 v22, s51, 41
+; SI-NEXT:    ; implicit-def: $sgpr50
 ; SI-NEXT:    s_branch .LBB13_2
 ;
 ; VI-LABEL: bitcast_v32i32_to_v128i8_scalar:
@@ -34454,385 +34790,431 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3
 ; SI-LABEL: bitcast_v32i32_to_v64i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_writelane_b32 v20, s30, 0
+; SI-NEXT:    v_writelane_b32 v20, s31, 1
+; SI-NEXT:    v_writelane_b32 v20, s34, 2
+; SI-NEXT:    v_writelane_b32 v20, s35, 3
+; SI-NEXT:    v_writelane_b32 v20, s36, 4
+; SI-NEXT:    v_writelane_b32 v20, s37, 5
+; SI-NEXT:    v_writelane_b32 v20, s38, 6
+; SI-NEXT:    v_writelane_b32 v20, s39, 7
+; SI-NEXT:    v_writelane_b32 v20, s48, 8
+; SI-NEXT:    v_writelane_b32 v20, s49, 9
+; SI-NEXT:    v_writelane_b32 v20, s50, 10
+; SI-NEXT:    v_writelane_b32 v20, s51, 11
+; SI-NEXT:    v_writelane_b32 v20, s52, 12
+; SI-NEXT:    v_writelane_b32 v20, s53, 13
+; SI-NEXT:    v_writelane_b32 v20, s54, 14
+; SI-NEXT:    v_writelane_b32 v20, s55, 15
+; SI-NEXT:    v_writelane_b32 v20, s64, 16
+; SI-NEXT:    v_writelane_b32 v20, s65, 17
+; SI-NEXT:    v_writelane_b32 v20, s66, 18
+; SI-NEXT:    v_writelane_b32 v20, s67, 19
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v19
-; SI-NEXT:    v_readfirstlane_b32 s47, v1
-; SI-NEXT:    v_readfirstlane_b32 s46, v2
-; SI-NEXT:    v_readfirstlane_b32 s45, v3
-; SI-NEXT:    v_readfirstlane_b32 s44, v4
-; SI-NEXT:    v_readfirstlane_b32 s43, v5
-; SI-NEXT:    v_readfirstlane_b32 s42, v6
-; SI-NEXT:    v_readfirstlane_b32 s41, v7
-; SI-NEXT:    v_readfirstlane_b32 s40, v8
-; SI-NEXT:    v_readfirstlane_b32 s15, v9
-; SI-NEXT:    v_readfirstlane_b32 s14, v10
-; SI-NEXT:    v_readfirstlane_b32 s13, v11
-; SI-NEXT:    v_readfirstlane_b32 s12, v12
-; SI-NEXT:    v_readfirstlane_b32 s11, v13
-; SI-NEXT:    v_readfirstlane_b32 s10, v14
-; SI-NEXT:    v_readfirstlane_b32 s9, v15
-; SI-NEXT:    v_readfirstlane_b32 s8, v16
-; SI-NEXT:    v_readfirstlane_b32 s7, v17
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_readfirstlane_b32 s6, v18
+; SI-NEXT:    v_writelane_b32 v20, s68, 20
+; SI-NEXT:    v_readfirstlane_b32 s44, v1
+; SI-NEXT:    v_readfirstlane_b32 s45, v2
+; SI-NEXT:    v_readfirstlane_b32 s42, v3
+; SI-NEXT:    v_readfirstlane_b32 s43, v4
+; SI-NEXT:    v_readfirstlane_b32 s40, v5
+; SI-NEXT:    v_readfirstlane_b32 s41, v6
+; SI-NEXT:    v_readfirstlane_b32 s14, v7
+; SI-NEXT:    v_readfirstlane_b32 s15, v8
+; SI-NEXT:    v_readfirstlane_b32 s12, v9
+; SI-NEXT:    v_readfirstlane_b32 s13, v10
+; SI-NEXT:    v_readfirstlane_b32 s10, v11
+; SI-NEXT:    v_readfirstlane_b32 s11, v12
+; SI-NEXT:    v_readfirstlane_b32 s8, v13
+; SI-NEXT:    v_readfirstlane_b32 s9, v14
+; SI-NEXT:    v_readfirstlane_b32 s6, v15
+; SI-NEXT:    v_readfirstlane_b32 s7, v16
+; SI-NEXT:    v_readfirstlane_b32 s4, v17
+; SI-NEXT:    s_and_b64 s[46:47], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s5, v18
+; SI-NEXT:    v_writelane_b32 v20, s69, 21
 ; SI-NEXT:    s_cbranch_scc0 .LBB25_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s9
-; SI-NEXT:    v_mov_b32_e32 v3, s11
-; SI-NEXT:    v_mov_b32_e32 v4, s13
-; SI-NEXT:    v_mov_b32_e32 v5, s15
-; SI-NEXT:    v_mov_b32_e32 v6, s41
-; SI-NEXT:    v_mov_b32_e32 v7, s43
-; SI-NEXT:    v_mov_b32_e32 v8, s45
-; SI-NEXT:    v_mov_b32_e32 v9, s47
-; SI-NEXT:    v_mov_b32_e32 v10, s28
-; SI-NEXT:    v_mov_b32_e32 v11, s26
-; SI-NEXT:    v_mov_b32_e32 v12, s24
-; SI-NEXT:    v_mov_b32_e32 v13, s22
-; SI-NEXT:    v_mov_b32_e32 v14, s20
-; SI-NEXT:    v_mov_b32_e32 v15, s18
-; SI-NEXT:    v_mov_b32_e32 v16, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v2, s8, v2, 16
-; SI-NEXT:    v_alignbit_b32 v3, s10, v3, 16
-; SI-NEXT:    v_alignbit_b32 v4, s12, v4, 16
-; SI-NEXT:    v_alignbit_b32 v5, s14, v5, 16
-; SI-NEXT:    v_alignbit_b32 v6, s40, v6, 16
-; SI-NEXT:    v_alignbit_b32 v7, s42, v7, 16
-; SI-NEXT:    v_alignbit_b32 v8, s44, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, s46, v9, 16
-; SI-NEXT:    v_alignbit_b32 v10, s29, v10, 16
-; SI-NEXT:    v_alignbit_b32 v11, s27, v11, 16
-; SI-NEXT:    v_alignbit_b32 v12, s25, v12, 16
-; SI-NEXT:    v_alignbit_b32 v13, s23, v13, 16
-; SI-NEXT:    v_alignbit_b32 v14, s21, v14, 16
-; SI-NEXT:    v_alignbit_b32 v15, s19, v15, 16
-; SI-NEXT:    v_alignbit_b32 v16, s17, v16, 16
-; SI-NEXT:    s_lshr_b32 s56, s6, 16
-; SI-NEXT:    s_lshr_b32 s57, s8, 16
-; SI-NEXT:    s_lshr_b32 s58, s10, 16
-; SI-NEXT:    s_lshr_b32 s59, s12, 16
-; SI-NEXT:    s_lshr_b32 s60, s14, 16
-; SI-NEXT:    s_lshr_b32 s61, s40, 16
-; SI-NEXT:    s_lshr_b32 s62, s42, 16
-; SI-NEXT:    s_lshr_b32 s63, s44, 16
-; SI-NEXT:    s_lshr_b32 s72, s46, 16
-; SI-NEXT:    s_lshr_b32 s73, s29, 16
-; SI-NEXT:    s_lshr_b32 s74, s27, 16
-; SI-NEXT:    s_lshr_b32 s75, s25, 16
-; SI-NEXT:    s_lshr_b32 s76, s23, 16
-; SI-NEXT:    s_lshr_b32 s77, s21, 16
-; SI-NEXT:    s_lshr_b32 s78, s19, 16
-; SI-NEXT:    s_lshr_b32 s79, s17, 16
+; SI-NEXT:    s_lshr_b32 s38, s5, 16
+; SI-NEXT:    s_lshr_b32 s39, s7, 16
+; SI-NEXT:    s_lshr_b32 s48, s9, 16
+; SI-NEXT:    s_lshr_b32 s49, s11, 16
+; SI-NEXT:    s_lshr_b32 s50, s13, 16
+; SI-NEXT:    s_lshr_b32 s51, s15, 16
+; SI-NEXT:    s_lshr_b32 s52, s41, 16
+; SI-NEXT:    s_lshr_b32 s53, s43, 16
+; SI-NEXT:    s_lshr_b32 s54, s45, 16
+; SI-NEXT:    s_lshr_b32 s55, s29, 16
+; SI-NEXT:    s_lshr_b32 s64, s27, 16
+; SI-NEXT:    s_lshr_b32 s65, s25, 16
+; SI-NEXT:    s_lshr_b32 s66, s23, 16
+; SI-NEXT:    s_lshr_b32 s67, s21, 16
+; SI-NEXT:    s_lshr_b32 s68, s19, 16
+; SI-NEXT:    s_lshr_b32 s69, s17, 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[58:59], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[10:11], 16
+; SI-NEXT:    s_lshr_b64 s[62:63], s[12:13], 16
+; SI-NEXT:    s_lshr_b64 s[72:73], s[14:15], 16
+; SI-NEXT:    s_lshr_b64 s[74:75], s[40:41], 16
+; SI-NEXT:    s_lshr_b64 s[76:77], s[42:43], 16
+; SI-NEXT:    s_lshr_b64 s[78:79], s[44:45], 16
+; SI-NEXT:    s_lshr_b64 s[88:89], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[90:91], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[92:93], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[94:95], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[30:31], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[34:35], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[36:37], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB25_3
 ; SI-NEXT:  .LBB25_2: ; %cmp.true
+; SI-NEXT:    s_add_i32 s17, s17, 3
 ; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    s_add_i32 s19, s19, 3
 ; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_add_i32 s21, s21, 3
 ; SI-NEXT:    s_add_i32 s20, s20, 3
+; SI-NEXT:    s_add_i32 s23, s23, 3
 ; SI-NEXT:    s_add_i32 s22, s22, 3
+; SI-NEXT:    s_add_i32 s25, s25, 3
 ; SI-NEXT:    s_add_i32 s24, s24, 3
+; SI-NEXT:    s_add_i32 s27, s27, 3
 ; SI-NEXT:    s_add_i32 s26, s26, 3
+; SI-NEXT:    s_add_i32 s29, s29, 3
 ; SI-NEXT:    s_add_i32 s28, s28, 3
-; SI-NEXT:    s_add_i32 s47, s47, 3
 ; SI-NEXT:    s_add_i32 s45, s45, 3
-; SI-NEXT:    s_add_i32 s43, s43, 3
-; SI-NEXT:    s_add_i32 s41, s41, 3
-; SI-NEXT:    s_add_i32 s15, s15, 3
-; SI-NEXT:    s_add_i32 s13, s13, 3
-; SI-NEXT:    s_add_i32 s11, s11, 3
-; SI-NEXT:    s_add_i32 s9, s9, 3
-; SI-NEXT:    s_add_i32 s7, s7, 3
-; SI-NEXT:    s_add_i32 s17, s17, 3
-; SI-NEXT:    s_add_i32 s19, s19, 3
-; SI-NEXT:    s_add_i32 s21, s21, 3
-; SI-NEXT:    s_add_i32 s23, s23, 3
-; SI-NEXT:    s_add_i32 s25, s25, 3
-; SI-NEXT:    s_add_i32 s27, s27, 3
-; SI-NEXT:    s_add_i32 s29, s29, 3
-; SI-NEXT:    s_add_i32 s46, s46, 3
 ; SI-NEXT:    s_add_i32 s44, s44, 3
+; SI-NEXT:    s_add_i32 s43, s43, 3
 ; SI-NEXT:    s_add_i32 s42, s42, 3
+; SI-NEXT:    s_add_i32 s41, s41, 3
 ; SI-NEXT:    s_add_i32 s40, s40, 3
+; SI-NEXT:    s_add_i32 s15, s15, 3
 ; SI-NEXT:    s_add_i32 s14, s14, 3
+; SI-NEXT:    s_add_i32 s13, s13, 3
 ; SI-NEXT:    s_add_i32 s12, s12, 3
+; SI-NEXT:    s_add_i32 s11, s11, 3
 ; SI-NEXT:    s_add_i32 s10, s10, 3
+; SI-NEXT:    s_add_i32 s9, s9, 3
 ; SI-NEXT:    s_add_i32 s8, s8, 3
+; SI-NEXT:    s_add_i32 s7, s7, 3
 ; SI-NEXT:    s_add_i32 s6, s6, 3
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s9
-; SI-NEXT:    v_mov_b32_e32 v3, s11
-; SI-NEXT:    v_mov_b32_e32 v4, s13
-; SI-NEXT:    v_mov_b32_e32 v5, s15
-; SI-NEXT:    v_mov_b32_e32 v6, s41
-; SI-NEXT:    v_mov_b32_e32 v7, s43
-; SI-NEXT:    v_mov_b32_e32 v8, s45
-; SI-NEXT:    v_mov_b32_e32 v9, s47
-; SI-NEXT:    v_mov_b32_e32 v10, s28
-; SI-NEXT:    v_mov_b32_e32 v11, s26
-; SI-NEXT:    v_mov_b32_e32 v12, s24
-; SI-NEXT:    v_mov_b32_e32 v13, s22
-; SI-NEXT:    v_mov_b32_e32 v14, s20
-; SI-NEXT:    v_mov_b32_e32 v15, s18
-; SI-NEXT:    v_mov_b32_e32 v16, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v2, s8, v2, 16
-; SI-NEXT:    v_alignbit_b32 v3, s10, v3, 16
-; SI-NEXT:    v_alignbit_b32 v4, s12, v4, 16
-; SI-NEXT:    v_alignbit_b32 v5, s14, v5, 16
-; SI-NEXT:    v_alignbit_b32 v6, s40, v6, 16
-; SI-NEXT:    v_alignbit_b32 v7, s42, v7, 16
-; SI-NEXT:    v_alignbit_b32 v8, s44, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, s46, v9, 16
-; SI-NEXT:    v_alignbit_b32 v10, s29, v10, 16
-; SI-NEXT:    v_alignbit_b32 v11, s27, v11, 16
-; SI-NEXT:    v_alignbit_b32 v12, s25, v12, 16
-; SI-NEXT:    v_alignbit_b32 v13, s23, v13, 16
-; SI-NEXT:    v_alignbit_b32 v14, s21, v14, 16
-; SI-NEXT:    v_alignbit_b32 v15, s19, v15, 16
-; SI-NEXT:    v_alignbit_b32 v16, s17, v16, 16
-; SI-NEXT:    s_lshr_b32 s56, s6, 16
-; SI-NEXT:    s_lshr_b32 s57, s8, 16
-; SI-NEXT:    s_lshr_b32 s58, s10, 16
-; SI-NEXT:    s_lshr_b32 s59, s12, 16
-; SI-NEXT:    s_lshr_b32 s60, s14, 16
-; SI-NEXT:    s_lshr_b32 s61, s40, 16
-; SI-NEXT:    s_lshr_b32 s62, s42, 16
-; SI-NEXT:    s_lshr_b32 s63, s44, 16
-; SI-NEXT:    s_lshr_b32 s72, s46, 16
-; SI-NEXT:    s_lshr_b32 s73, s29, 16
-; SI-NEXT:    s_lshr_b32 s74, s27, 16
-; SI-NEXT:    s_lshr_b32 s75, s25, 16
-; SI-NEXT:    s_lshr_b32 s76, s23, 16
-; SI-NEXT:    s_lshr_b32 s77, s21, 16
-; SI-NEXT:    s_lshr_b32 s78, s19, 16
-; SI-NEXT:    s_lshr_b32 s79, s17, 16
+; SI-NEXT:    s_add_i32 s5, s5, 3
+; SI-NEXT:    s_add_i32 s4, s4, 3
+; SI-NEXT:    s_lshr_b64 s[46:47], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[58:59], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[10:11], 16
+; SI-NEXT:    s_lshr_b64 s[62:63], s[12:13], 16
+; SI-NEXT:    s_lshr_b64 s[72:73], s[14:15], 16
+; SI-NEXT:    s_lshr_b64 s[74:75], s[40:41], 16
+; SI-NEXT:    s_lshr_b64 s[76:77], s[42:43], 16
+; SI-NEXT:    s_lshr_b64 s[78:79], s[44:45], 16
+; SI-NEXT:    s_lshr_b64 s[88:89], s[28:29], 16
+; SI-NEXT:    s_lshr_b32 s38, s5, 16
+; SI-NEXT:    s_lshr_b32 s39, s7, 16
+; SI-NEXT:    s_lshr_b32 s48, s9, 16
+; SI-NEXT:    s_lshr_b32 s49, s11, 16
+; SI-NEXT:    s_lshr_b32 s50, s13, 16
+; SI-NEXT:    s_lshr_b32 s51, s15, 16
+; SI-NEXT:    s_lshr_b32 s52, s41, 16
+; SI-NEXT:    s_lshr_b32 s53, s43, 16
+; SI-NEXT:    s_lshr_b32 s54, s45, 16
+; SI-NEXT:    s_lshr_b32 s55, s29, 16
+; SI-NEXT:    s_lshr_b32 s64, s27, 16
+; SI-NEXT:    s_lshr_b32 s65, s25, 16
+; SI-NEXT:    s_lshr_b32 s66, s23, 16
+; SI-NEXT:    s_lshr_b32 s67, s21, 16
+; SI-NEXT:    s_lshr_b32 s68, s19, 16
+; SI-NEXT:    s_lshr_b32 s69, s17, 16
+; SI-NEXT:    s_lshr_b64 s[90:91], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[92:93], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[94:95], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[30:31], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[34:35], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[36:37], s[16:17], 16
 ; SI-NEXT:  .LBB25_3: ; %end
-; SI-NEXT:    s_and_b32 s4, s16, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; SI-NEXT:    v_or_b32_e32 v16, s4, v16
-; SI-NEXT:    s_and_b32 s4, s17, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s79, 16
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_mov_b32_e32 v17, s4
-; SI-NEXT:    s_and_b32 s4, s18, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; SI-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen
+; SI-NEXT:    s_lshl_b32 s47, s36, 16
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s47
+; SI-NEXT:    v_mov_b32_e32 v1, s16
+; SI-NEXT:    s_and_b32 s16, s17, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s69, 16
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_lshl_b32 s16, s34, 16
+; SI-NEXT:    s_and_b32 s17, s18, 0xffff
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    v_mov_b32_e32 v3, s16
+; SI-NEXT:    s_and_b32 s16, s19, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s68, 16
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v4, s16
+; SI-NEXT:    s_lshl_b32 s16, s30, 16
+; SI-NEXT:    s_and_b32 s17, s20, 0xffff
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v16, vcc, 4, v0
-; SI-NEXT:    v_or_b32_e32 v15, s4, v15
-; SI-NEXT:    s_and_b32 s4, s19, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s78, 16
-; SI-NEXT:    buffer_store_dword v17, v16, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v16, vcc, 8, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v15, v16, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v16, s4
-; SI-NEXT:    s_and_b32 s4, s20, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
+; SI-NEXT:    v_mov_b32_e32 v5, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
+; SI-NEXT:    s_and_b32 s16, s21, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s67, 16
+; SI-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 12, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 16, v0
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s22, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s94, 16
+; SI-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 20, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v15, vcc, 12, v0
-; SI-NEXT:    v_or_b32_e32 v14, s4, v14
-; SI-NEXT:    s_and_b32 s4, s21, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s77, 16
-; SI-NEXT:    buffer_store_dword v16, v15, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v15, vcc, 16, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v14, v15, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v15, s4
-; SI-NEXT:    s_and_b32 s4, s22, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s23, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s66, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 24, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 20, v0
-; SI-NEXT:    v_or_b32_e32 v13, s4, v13
-; SI-NEXT:    s_and_b32 s4, s23, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s76, 16
-; SI-NEXT:    buffer_store_dword v15, v14, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 24, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v14, s4
-; SI-NEXT:    s_and_b32 s4, s24, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s24, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s92, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 28, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v13, vcc, 28, v0
-; SI-NEXT:    v_or_b32_e32 v12, s4, v12
-; SI-NEXT:    s_and_b32 s4, s25, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s75, 16
-; SI-NEXT:    buffer_store_dword v14, v13, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v13, vcc, 32, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v12, v13, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v13, s4
-; SI-NEXT:    s_and_b32 s4, s26, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s25, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s65, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 32, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v12, vcc, 36, v0
-; SI-NEXT:    v_or_b32_e32 v11, s4, v11
-; SI-NEXT:    s_and_b32 s4, s27, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s74, 16
-; SI-NEXT:    buffer_store_dword v13, v12, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v12, vcc, 40, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v11, v12, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v12, s4
-; SI-NEXT:    s_and_b32 s4, s28, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s26, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s90, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 36, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 44, v0
-; SI-NEXT:    v_or_b32_e32 v10, s4, v10
-; SI-NEXT:    s_and_b32 s4, s29, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s73, 16
-; SI-NEXT:    buffer_store_dword v12, v11, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v10, v11, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v11, s4
-; SI-NEXT:    s_and_b32 s4, s47, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s27, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s64, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 40, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 52, v0
-; SI-NEXT:    v_or_b32_e32 v9, s4, v9
-; SI-NEXT:    s_and_b32 s4, s46, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s72, 16
-; SI-NEXT:    buffer_store_dword v11, v10, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 56, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v9, v10, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v10, s4
-; SI-NEXT:    s_and_b32 s4, s45, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s28, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s88, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 44, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 60, v0
-; SI-NEXT:    v_or_b32_e32 v8, s4, v8
-; SI-NEXT:    s_and_b32 s4, s44, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s63, 16
-; SI-NEXT:    buffer_store_dword v10, v9, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 64, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v8, v9, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v9, s4
-; SI-NEXT:    s_and_b32 s4, s43, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s29, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s55, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 48, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x44, v0
-; SI-NEXT:    v_or_b32_e32 v7, s4, v7
-; SI-NEXT:    s_and_b32 s4, s42, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s62, 16
-; SI-NEXT:    buffer_store_dword v9, v8, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v8, s4
-; SI-NEXT:    s_and_b32 s4, s41, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s44, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s78, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 52, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x4c, v0
-; SI-NEXT:    v_or_b32_e32 v6, s4, v6
-; SI-NEXT:    s_and_b32 s4, s40, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s61, 16
-; SI-NEXT:    buffer_store_dword v8, v7, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x50, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v7, s4
-; SI-NEXT:    s_and_b32 s4, s15, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s45, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s54, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 56, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x54, v0
-; SI-NEXT:    v_or_b32_e32 v5, s4, v5
-; SI-NEXT:    s_and_b32 s4, s14, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s60, 16
-; SI-NEXT:    buffer_store_dword v7, v6, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x58, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v6, s4
-; SI-NEXT:    s_and_b32 s4, s13, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s42, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s76, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 60, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x5c, v0
-; SI-NEXT:    v_or_b32_e32 v4, s4, v4
-; SI-NEXT:    s_and_b32 s4, s12, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s59, 16
-; SI-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x60, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v5, s4
-; SI-NEXT:    s_and_b32 s4, s11, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s43, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s53, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 64, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x64, v0
-; SI-NEXT:    v_or_b32_e32 v3, s4, v3
-; SI-NEXT:    s_and_b32 s4, s10, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s58, 16
-; SI-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x68, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v4, s4
-; SI-NEXT:    s_and_b32 s4, s9, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s40, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s74, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x44, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x6c, v0
-; SI-NEXT:    v_or_b32_e32 v2, s4, v2
-; SI-NEXT:    s_and_b32 s4, s8, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s57, 16
-; SI-NEXT:    buffer_store_dword v4, v3, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x70, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v3, s4
-; SI-NEXT:    s_and_b32 s4, s7, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s41, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s52, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x48, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x74, v0
-; SI-NEXT:    v_or_b32_e32 v1, s4, v1
-; SI-NEXT:    s_and_b32 s4, s6, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s56, 16
-; SI-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x78, v0
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s14, s14, 0xffff
+; SI-NEXT:    s_lshl_b32 s16, s72, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x4c, v0
+; SI-NEXT:    s_or_b32 s14, s14, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s14
+; SI-NEXT:    s_and_b32 s14, s15, 0xffff
+; SI-NEXT:    s_lshl_b32 s15, s51, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x50, v0
+; SI-NEXT:    s_or_b32 s14, s14, s15
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s14
+; SI-NEXT:    s_and_b32 s12, s12, 0xffff
+; SI-NEXT:    s_lshl_b32 s14, s62, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x54, v0
+; SI-NEXT:    s_or_b32 s12, s12, s14
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s12
+; SI-NEXT:    s_and_b32 s12, s13, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s50, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x58, v0
+; SI-NEXT:    s_or_b32 s12, s12, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s12
+; SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; SI-NEXT:    s_lshl_b32 s12, s60, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x5c, v0
+; SI-NEXT:    s_or_b32 s10, s10, s12
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s10, s11, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s49, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x60, v0
+; SI-NEXT:    s_or_b32 s10, s10, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; SI-NEXT:    s_lshl_b32 s10, s58, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x64, v0
+; SI-NEXT:    s_or_b32 s8, s8, s10
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s8, s9, 0xffff
+; SI-NEXT:    s_lshl_b32 s9, s48, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x68, v0
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_lshl_b32 s8, s56, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x6c, v0
+; SI-NEXT:    s_or_b32 s6, s6, s8
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s6, s7, 0xffff
+; SI-NEXT:    s_lshl_b32 s7, s39, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x70, v0
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; SI-NEXT:    s_lshl_b32 s6, s46, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x74, v0
+; SI-NEXT:    s_or_b32 s4, s4, s6
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s4
+; SI-NEXT:    s_and_b32 s4, s5, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s38, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x78, v0
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT:    v_readlane_b32 s69, v20, 21
+; SI-NEXT:    v_readlane_b32 s68, v20, 20
+; SI-NEXT:    v_readlane_b32 s67, v20, 19
+; SI-NEXT:    v_readlane_b32 s66, v20, 18
+; SI-NEXT:    v_readlane_b32 s65, v20, 17
+; SI-NEXT:    v_readlane_b32 s64, v20, 16
+; SI-NEXT:    v_readlane_b32 s55, v20, 15
+; SI-NEXT:    v_readlane_b32 s54, v20, 14
+; SI-NEXT:    v_readlane_b32 s53, v20, 13
+; SI-NEXT:    v_readlane_b32 s52, v20, 12
+; SI-NEXT:    v_readlane_b32 s51, v20, 11
+; SI-NEXT:    v_readlane_b32 s50, v20, 10
+; SI-NEXT:    v_readlane_b32 s49, v20, 9
+; SI-NEXT:    v_readlane_b32 s48, v20, 8
+; SI-NEXT:    v_readlane_b32 s39, v20, 7
+; SI-NEXT:    v_readlane_b32 s38, v20, 6
+; SI-NEXT:    v_readlane_b32 s37, v20, 5
+; SI-NEXT:    v_readlane_b32 s36, v20, 4
+; SI-NEXT:    v_readlane_b32 s35, v20, 3
+; SI-NEXT:    v_readlane_b32 s34, v20, 2
+; SI-NEXT:    v_readlane_b32 s31, v20, 1
+; SI-NEXT:    v_readlane_b32 s30, v20, 0
+; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB25_4:
-; SI-NEXT:    ; implicit-def: $vgpr16
-; SI-NEXT:    ; implicit-def: $sgpr79
-; SI-NEXT:    ; implicit-def: $vgpr15
+; SI-NEXT:    ; implicit-def: $sgpr36
+; SI-NEXT:    ; implicit-def: $sgpr69
+; SI-NEXT:    ; implicit-def: $sgpr34
+; SI-NEXT:    ; implicit-def: $sgpr68
+; SI-NEXT:    ; implicit-def: $sgpr30
+; SI-NEXT:    ; implicit-def: $sgpr67
+; SI-NEXT:    ; implicit-def: $sgpr94
+; SI-NEXT:    ; implicit-def: $sgpr66
+; SI-NEXT:    ; implicit-def: $sgpr92
+; SI-NEXT:    ; implicit-def: $sgpr65
+; SI-NEXT:    ; implicit-def: $sgpr90
+; SI-NEXT:    ; implicit-def: $sgpr64
+; SI-NEXT:    ; implicit-def: $sgpr88
+; SI-NEXT:    ; implicit-def: $sgpr55
 ; SI-NEXT:    ; implicit-def: $sgpr78
-; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; implicit-def: $sgpr77
-; SI-NEXT:    ; implicit-def: $vgpr13
+; SI-NEXT:    ; implicit-def: $sgpr54
 ; SI-NEXT:    ; implicit-def: $sgpr76
-; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; implicit-def: $sgpr75
-; SI-NEXT:    ; implicit-def: $vgpr11
+; SI-NEXT:    ; implicit-def: $sgpr53
 ; SI-NEXT:    ; implicit-def: $sgpr74
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $sgpr73
-; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $sgpr52
 ; SI-NEXT:    ; implicit-def: $sgpr72
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $sgpr63
-; SI-NEXT:    ; implicit-def: $vgpr7
+; SI-NEXT:    ; implicit-def: $sgpr51
 ; SI-NEXT:    ; implicit-def: $sgpr62
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $sgpr61
-; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $sgpr50
 ; SI-NEXT:    ; implicit-def: $sgpr60
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $sgpr59
-; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $sgpr49
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr39
+; SI-NEXT:    ; implicit-def: $sgpr38
 ; SI-NEXT:    ; implicit-def: $sgpr58
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $sgpr57
-; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; implicit-def: $sgpr56
+; SI-NEXT:    ; implicit-def: $sgpr46
 ; SI-NEXT:    s_branch .LBB25_2
 ;
 ; VI-LABEL: bitcast_v32i32_to_v64i16_scalar:
@@ -43268,887 +43650,1713 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
 ; SI-LABEL: bitcast_v32f32_to_v128i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    v_writelane_b32 v63, s30, 0
+; SI-NEXT:    v_writelane_b32 v63, s31, 1
+; SI-NEXT:    v_writelane_b32 v63, s34, 2
+; SI-NEXT:    v_writelane_b32 v63, s35, 3
+; SI-NEXT:    v_writelane_b32 v63, s36, 4
+; SI-NEXT:    v_writelane_b32 v63, s37, 5
+; SI-NEXT:    v_writelane_b32 v63, s38, 6
+; SI-NEXT:    v_writelane_b32 v63, s39, 7
+; SI-NEXT:    v_writelane_b32 v63, s48, 8
+; SI-NEXT:    v_writelane_b32 v63, s49, 9
+; SI-NEXT:    v_writelane_b32 v63, s50, 10
+; SI-NEXT:    v_writelane_b32 v63, s51, 11
+; SI-NEXT:    v_writelane_b32 v63, s52, 12
+; SI-NEXT:    v_writelane_b32 v63, s53, 13
+; SI-NEXT:    v_writelane_b32 v63, s54, 14
+; SI-NEXT:    v_writelane_b32 v63, s55, 15
+; SI-NEXT:    v_writelane_b32 v63, s64, 16
+; SI-NEXT:    v_writelane_b32 v63, s65, 17
+; SI-NEXT:    v_writelane_b32 v63, s66, 18
+; SI-NEXT:    v_writelane_b32 v63, s67, 19
+; SI-NEXT:    v_writelane_b32 v63, s68, 20
+; SI-NEXT:    v_writelane_b32 v63, s69, 21
+; SI-NEXT:    v_writelane_b32 v63, s70, 22
+; SI-NEXT:    v_writelane_b32 v63, s71, 23
+; SI-NEXT:    v_writelane_b32 v63, s80, 24
+; SI-NEXT:    v_writelane_b32 v63, s81, 25
+; SI-NEXT:    v_writelane_b32 v63, s82, 26
+; SI-NEXT:    v_writelane_b32 v63, s83, 27
+; SI-NEXT:    v_writelane_b32 v63, s84, 28
+; SI-NEXT:    v_writelane_b32 v63, s85, 29
+; SI-NEXT:    v_writelane_b32 v63, s86, 30
+; SI-NEXT:    v_writelane_b32 v63, s87, 31
+; SI-NEXT:    v_writelane_b32 v63, s96, 32
+; SI-NEXT:    v_writelane_b32 v63, s97, 33
+; SI-NEXT:    v_writelane_b32 v63, s98, 34
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v19
-; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    v_mov_b32_e32 v56, s16
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_mov_b32_e32 v47, s17
-; SI-NEXT:    v_mov_b32_e32 v44, s18
-; SI-NEXT:    v_mov_b32_e32 v42, s19
-; SI-NEXT:    v_mov_b32_e32 v40, s20
-; SI-NEXT:    v_mov_b32_e32 v53, s21
-; SI-NEXT:    v_mov_b32_e32 v51, s22
-; SI-NEXT:    v_mov_b32_e32 v48, s23
-; SI-NEXT:    v_mov_b32_e32 v38, s24
-; SI-NEXT:    v_mov_b32_e32 v35, s25
-; SI-NEXT:    v_mov_b32_e32 v33, s26
-; SI-NEXT:    v_mov_b32_e32 v30, s27
-; SI-NEXT:    v_mov_b32_e32 v28, s28
-; SI-NEXT:    v_mov_b32_e32 v25, s29
-; SI-NEXT:    s_cbranch_scc0 .LBB37_4
+; SI-NEXT:    v_writelane_b32 v63, s99, 35
+; SI-NEXT:    v_readfirstlane_b32 s44, v1
+; SI-NEXT:    v_readfirstlane_b32 s45, v2
+; SI-NEXT:    v_readfirstlane_b32 s42, v3
+; SI-NEXT:    v_readfirstlane_b32 s43, v4
+; SI-NEXT:    v_readfirstlane_b32 s40, v5
+; SI-NEXT:    v_readfirstlane_b32 s41, v6
+; SI-NEXT:    v_readfirstlane_b32 s14, v7
+; SI-NEXT:    v_readfirstlane_b32 s15, v8
+; SI-NEXT:    v_readfirstlane_b32 s12, v9
+; SI-NEXT:    v_readfirstlane_b32 s13, v10
+; SI-NEXT:    v_readfirstlane_b32 s10, v11
+; SI-NEXT:    v_readfirstlane_b32 s11, v12
+; SI-NEXT:    v_readfirstlane_b32 s8, v13
+; SI-NEXT:    v_readfirstlane_b32 s9, v14
+; SI-NEXT:    v_readfirstlane_b32 s6, v15
+; SI-NEXT:    v_readfirstlane_b32 s7, v16
+; SI-NEXT:    v_readfirstlane_b32 s4, v17
+; SI-NEXT:    s_and_b64 s[46:47], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s5, v18
+; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    ; implicit-def: $vgpr61 : SGPR spill to VGPR lane
+; SI-NEXT:    ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
+; SI-NEXT:    s_cbranch_scc0 .LBB37_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_alignbit_b32 v19, v18, v17, 24
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v18, v17, 16
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v18, v17, 8
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v16, v15, 24
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v16, v15, 16
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v16, v15, 8
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v14, v13, 24
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v14, v13, 16
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v14, v13, 8
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v12, v11, 24
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v12, v11, 16
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v12, v11, 8
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v10, v9, 24
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v10, v9, 16
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v10, v9, 8
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v8, v7, 24
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v8, v7, 16
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v8, v7, 8
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v6, v5, 24
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v6, v5, 16
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v6, v5, 8
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v4, v3, 24
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v4, v3, 16
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v4, v3, 8
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v2, v1, 24
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v2, v1, 16
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v2, v1, 8
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v25, v28, 24
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v25, v28, 16
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v25, v28, 8
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v30, v33, 24
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v30, v33, 16
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v30, v33, 8
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v35, v38, 24
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v35, v38, 16
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v35, v38, 8
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT:    s_lshr_b32 s46, s5, 24
+; SI-NEXT:    v_writelane_b32 v62, s46, 17
+; SI-NEXT:    s_lshr_b32 s46, s5, 16
+; SI-NEXT:    v_writelane_b32 v62, s46, 16
+; SI-NEXT:    s_lshr_b32 s46, s5, 8
+; SI-NEXT:    v_writelane_b32 v62, s46, 15
+; SI-NEXT:    s_lshr_b32 s46, s7, 24
+; SI-NEXT:    v_writelane_b32 v62, s46, 14
+; SI-NEXT:    s_lshr_b32 s46, s7, 16
+; SI-NEXT:    v_writelane_b32 v62, s46, 13
+; SI-NEXT:    s_lshr_b32 s46, s7, 8
+; SI-NEXT:    v_writelane_b32 v62, s46, 12
+; SI-NEXT:    s_lshr_b32 s46, s9, 24
+; SI-NEXT:    v_writelane_b32 v62, s46, 11
+; SI-NEXT:    s_lshr_b32 s46, s9, 16
+; SI-NEXT:    v_writelane_b32 v62, s46, 10
+; SI-NEXT:    s_lshr_b32 s46, s9, 8
+; SI-NEXT:    v_writelane_b32 v62, s46, 9
+; SI-NEXT:    s_lshr_b32 s46, s11, 24
+; SI-NEXT:    v_writelane_b32 v62, s46, 8
+; SI-NEXT:    s_lshr_b32 s46, s11, 16
+; SI-NEXT:    v_writelane_b32 v62, s46, 7
+; SI-NEXT:    s_lshr_b32 s46, s11, 8
+; SI-NEXT:    v_writelane_b32 v62, s46, 6
+; SI-NEXT:    s_lshr_b32 s46, s13, 24
+; SI-NEXT:    v_writelane_b32 v62, s46, 5
+; SI-NEXT:    s_lshr_b32 s46, s13, 16
+; SI-NEXT:    v_writelane_b32 v62, s46, 4
+; SI-NEXT:    s_lshr_b32 s46, s13, 8
+; SI-NEXT:    v_writelane_b32 v62, s46, 3
+; SI-NEXT:    s_lshr_b32 s46, s15, 24
+; SI-NEXT:    v_writelane_b32 v62, s46, 2
+; SI-NEXT:    s_lshr_b32 s46, s15, 16
+; SI-NEXT:    v_writelane_b32 v62, s46, 1
+; SI-NEXT:    s_lshr_b32 s46, s15, 8
+; SI-NEXT:    v_writelane_b32 v62, s46, 0
+; SI-NEXT:    s_lshr_b32 s46, s41, 24
+; SI-NEXT:    v_writelane_b32 v61, s46, 63
+; SI-NEXT:    s_lshr_b32 s46, s41, 16
+; SI-NEXT:    v_writelane_b32 v61, s46, 62
+; SI-NEXT:    s_lshr_b32 s46, s41, 8
+; SI-NEXT:    v_writelane_b32 v61, s46, 61
+; SI-NEXT:    s_lshr_b32 s46, s43, 24
+; SI-NEXT:    v_writelane_b32 v61, s46, 60
+; SI-NEXT:    s_lshr_b32 s46, s43, 16
+; SI-NEXT:    v_writelane_b32 v61, s46, 59
+; SI-NEXT:    s_lshr_b32 s46, s43, 8
+; SI-NEXT:    v_writelane_b32 v61, s46, 58
+; SI-NEXT:    s_lshr_b32 s46, s45, 24
+; SI-NEXT:    v_writelane_b32 v61, s46, 57
+; SI-NEXT:    s_lshr_b32 s46, s45, 16
+; SI-NEXT:    v_writelane_b32 v61, s46, 56
+; SI-NEXT:    s_lshr_b32 s46, s45, 8
+; SI-NEXT:    v_writelane_b32 v61, s46, 55
+; SI-NEXT:    s_lshr_b32 s46, s29, 24
+; SI-NEXT:    v_writelane_b32 v61, s46, 54
+; SI-NEXT:    s_lshr_b32 s46, s29, 16
+; SI-NEXT:    v_writelane_b32 v61, s46, 53
+; SI-NEXT:    s_lshr_b32 s46, s29, 8
+; SI-NEXT:    v_writelane_b32 v61, s46, 52
+; SI-NEXT:    s_lshr_b32 s46, s27, 24
+; SI-NEXT:    v_writelane_b32 v61, s46, 51
+; SI-NEXT:    s_lshr_b32 s46, s27, 16
+; SI-NEXT:    v_writelane_b32 v61, s46, 50
+; SI-NEXT:    s_lshr_b32 s46, s27, 8
+; SI-NEXT:    v_writelane_b32 v61, s46, 49
+; SI-NEXT:    s_lshr_b32 s46, s25, 24
+; SI-NEXT:    v_writelane_b32 v61, s46, 48
+; SI-NEXT:    s_lshr_b32 s46, s25, 16
+; SI-NEXT:    v_writelane_b32 v61, s46, 47
+; SI-NEXT:    s_lshr_b32 s46, s25, 8
+; SI-NEXT:    v_writelane_b32 v61, s46, 46
+; SI-NEXT:    s_lshr_b32 s46, s23, 24
+; SI-NEXT:    v_writelane_b32 v61, s46, 45
+; SI-NEXT:    s_lshr_b32 s46, s23, 16
+; SI-NEXT:    v_writelane_b32 v61, s46, 44
+; SI-NEXT:    s_lshr_b32 s46, s23, 8
+; SI-NEXT:    v_writelane_b32 v61, s46, 43
+; SI-NEXT:    s_lshr_b32 s46, s21, 24
+; SI-NEXT:    v_writelane_b32 v61, s46, 42
+; SI-NEXT:    s_lshr_b32 s46, s21, 16
+; SI-NEXT:    v_writelane_b32 v61, s46, 41
+; SI-NEXT:    s_lshr_b32 s46, s21, 8
+; SI-NEXT:    v_writelane_b32 v61, s46, 40
+; SI-NEXT:    s_lshr_b32 s46, s19, 24
+; SI-NEXT:    v_writelane_b32 v61, s46, 39
+; SI-NEXT:    s_lshr_b32 s46, s19, 16
+; SI-NEXT:    v_writelane_b32 v61, s46, 38
+; SI-NEXT:    s_lshr_b32 s46, s19, 8
+; SI-NEXT:    v_writelane_b32 v61, s46, 37
+; SI-NEXT:    s_lshr_b32 s46, s17, 24
+; SI-NEXT:    v_writelane_b32 v61, s46, 36
+; SI-NEXT:    s_lshr_b32 s46, s17, 16
+; SI-NEXT:    v_writelane_b32 v61, s46, 35
+; SI-NEXT:    s_lshr_b32 s46, s17, 8
+; SI-NEXT:    v_writelane_b32 v61, s46, 34
+; SI-NEXT:    s_lshr_b64 s[46:47], s[4:5], 24
+; SI-NEXT:    v_writelane_b32 v61, s46, 32
+; SI-NEXT:    v_writelane_b32 v61, s47, 33
+; SI-NEXT:    s_lshr_b64 s[46:47], s[4:5], 8
+; SI-NEXT:    v_writelane_b32 v61, s46, 30
+; SI-NEXT:    v_writelane_b32 v61, s47, 31
+; SI-NEXT:    s_lshr_b64 s[46:47], s[6:7], 24
+; SI-NEXT:    v_writelane_b32 v61, s46, 28
+; SI-NEXT:    v_writelane_b32 v61, s47, 29
+; SI-NEXT:    s_lshr_b64 s[46:47], s[6:7], 16
+; SI-NEXT:    v_writelane_b32 v61, s46, 26
+; SI-NEXT:    v_writelane_b32 v61, s47, 27
+; SI-NEXT:    s_lshr_b64 s[46:47], s[6:7], 8
+; SI-NEXT:    v_writelane_b32 v61, s46, 24
+; SI-NEXT:    v_writelane_b32 v61, s47, 25
+; SI-NEXT:    s_lshr_b64 s[46:47], s[8:9], 24
+; SI-NEXT:    v_writelane_b32 v61, s46, 22
+; SI-NEXT:    v_writelane_b32 v61, s47, 23
+; SI-NEXT:    s_lshr_b64 s[46:47], s[8:9], 16
+; SI-NEXT:    v_writelane_b32 v61, s46, 20
+; SI-NEXT:    v_writelane_b32 v61, s47, 21
+; SI-NEXT:    s_lshr_b64 s[46:47], s[8:9], 8
+; SI-NEXT:    v_writelane_b32 v61, s46, 18
+; SI-NEXT:    v_writelane_b32 v61, s47, 19
+; SI-NEXT:    s_lshr_b64 s[46:47], s[10:11], 24
+; SI-NEXT:    v_writelane_b32 v61, s46, 16
+; SI-NEXT:    v_writelane_b32 v61, s47, 17
+; SI-NEXT:    s_lshr_b64 s[46:47], s[10:11], 16
+; SI-NEXT:    v_writelane_b32 v61, s46, 14
+; SI-NEXT:    v_writelane_b32 v61, s47, 15
+; SI-NEXT:    s_lshr_b64 s[46:47], s[10:11], 8
+; SI-NEXT:    v_writelane_b32 v61, s46, 12
+; SI-NEXT:    v_writelane_b32 v61, s47, 13
+; SI-NEXT:    s_lshr_b64 s[46:47], s[12:13], 24
+; SI-NEXT:    v_writelane_b32 v61, s46, 10
+; SI-NEXT:    v_writelane_b32 v61, s47, 11
+; SI-NEXT:    s_lshr_b64 s[46:47], s[12:13], 16
+; SI-NEXT:    v_writelane_b32 v61, s46, 8
+; SI-NEXT:    v_writelane_b32 v61, s47, 9
+; SI-NEXT:    s_lshr_b64 s[46:47], s[12:13], 8
+; SI-NEXT:    v_writelane_b32 v61, s46, 6
+; SI-NEXT:    v_writelane_b32 v61, s47, 7
+; SI-NEXT:    s_lshr_b64 s[46:47], s[14:15], 24
+; SI-NEXT:    v_writelane_b32 v61, s46, 4
+; SI-NEXT:    v_writelane_b32 v61, s47, 5
+; SI-NEXT:    s_lshr_b64 s[46:47], s[14:15], 16
+; SI-NEXT:    v_writelane_b32 v61, s46, 2
+; SI-NEXT:    v_writelane_b32 v61, s47, 3
+; SI-NEXT:    s_lshr_b64 s[46:47], s[14:15], 8
+; SI-NEXT:    v_writelane_b32 v61, s46, 0
+; SI-NEXT:    s_lshr_b64 s[48:49], s[4:5], 16
+; SI-NEXT:    v_writelane_b32 v61, s47, 1
+; SI-NEXT:    s_lshr_b64 s[50:51], s[40:41], 24
+; SI-NEXT:    s_lshr_b64 s[52:53], s[40:41], 16
+; SI-NEXT:    s_lshr_b64 s[54:55], s[40:41], 8
+; SI-NEXT:    s_lshr_b64 s[64:65], s[42:43], 24
+; SI-NEXT:    s_lshr_b64 s[66:67], s[42:43], 16
+; SI-NEXT:    s_lshr_b64 s[68:69], s[42:43], 8
+; SI-NEXT:    s_lshr_b64 s[70:71], s[44:45], 24
+; SI-NEXT:    s_lshr_b64 s[80:81], s[44:45], 16
+; SI-NEXT:    s_lshr_b64 s[82:83], s[44:45], 8
+; SI-NEXT:    s_lshr_b64 s[84:85], s[28:29], 24
+; SI-NEXT:    s_lshr_b64 s[86:87], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[96:97], s[28:29], 8
+; SI-NEXT:    s_lshr_b64 s[98:99], s[26:27], 24
+; SI-NEXT:    s_lshr_b64 s[46:47], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[26:27], 8
+; SI-NEXT:    s_lshr_b64 s[58:59], s[24:25], 24
+; SI-NEXT:    s_lshr_b64 s[60:61], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[62:63], s[24:25], 8
+; SI-NEXT:    s_lshr_b64 s[72:73], s[22:23], 24
+; SI-NEXT:    s_lshr_b64 s[74:75], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[76:77], s[22:23], 8
+; SI-NEXT:    s_lshr_b64 s[78:79], s[20:21], 24
+; SI-NEXT:    s_lshr_b64 s[88:89], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[90:91], s[20:21], 8
+; SI-NEXT:    s_lshr_b64 s[92:93], s[18:19], 24
+; SI-NEXT:    s_lshr_b64 s[94:95], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[30:31], s[18:19], 8
+; SI-NEXT:    s_lshr_b64 s[34:35], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[36:37], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[38:39], s[16:17], 8
+; SI-NEXT:    s_cbranch_execnz .LBB37_4
+; SI-NEXT:  .LBB37_2: ; %cmp.true
+; SI-NEXT:    v_add_f32_e64 v2, s5, 1.0
+; SI-NEXT:    v_add_f32_e64 v1, s4, 1.0
+; SI-NEXT:    v_lshr_b64 v[13:14], v[1:2], 24
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshr_b64 v[13:14], v[1:2], 16
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshr_b64 v[13:14], v[1:2], 8
+; SI-NEXT:    v_add_f32_e64 v4, s7, 1.0
+; SI-NEXT:    v_add_f32_e64 v3, s6, 1.0
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshr_b64 v[13:14], v[3:4], 24
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshr_b64 v[13:14], v[3:4], 16
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshr_b64 v[13:14], v[3:4], 8
+; SI-NEXT:    v_add_f32_e64 v6, s9, 1.0
+; SI-NEXT:    v_add_f32_e64 v5, s8, 1.0
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshr_b64 v[13:14], v[5:6], 24
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshr_b64 v[13:14], v[5:6], 16
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshr_b64 v[13:14], v[5:6], 8
+; SI-NEXT:    v_add_f32_e64 v8, s11, 1.0
+; SI-NEXT:    v_add_f32_e64 v7, s10, 1.0
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshr_b64 v[13:14], v[7:8], 24
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshr_b64 v[13:14], v[7:8], 16
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v48, v51, 24
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[13:14], v[7:8], 8
+; SI-NEXT:    v_add_f32_e64 v10, s13, 1.0
+; SI-NEXT:    v_add_f32_e64 v9, s12, 1.0
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v48, v51, 16
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[13:14], v[9:10], 24
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v48, v51, 8
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[13:14], v[9:10], 16
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 24, v18
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[13:14], v[9:10], 8
+; SI-NEXT:    v_add_f32_e64 v12, s15, 1.0
+; SI-NEXT:    v_add_f32_e64 v11, s14, 1.0
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 8, v18
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[13:14], v[11:12], 24
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 24, v16
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[13:14], v[11:12], 16
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v16
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[13:14], v[11:12], 8
+; SI-NEXT:    v_add_f32_e64 v16, s41, 1.0
+; SI-NEXT:    v_add_f32_e64 v15, s40, 1.0
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 8, v16
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[13:14], v[15:16], 24
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 24, v14
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[13:14], v[15:16], 16
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v14
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[13:14], v[15:16], 8
+; SI-NEXT:    v_add_f32_e64 v21, s43, 1.0
+; SI-NEXT:    v_add_f32_e64 v20, s42, 1.0
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 8, v14
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[13:14], v[20:21], 24
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 24, v12
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[13:14], v[20:21], 16
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v12
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[13:14], v[20:21], 8
+; SI-NEXT:    v_add_f32_e64 v26, s45, 1.0
+; SI-NEXT:    v_add_f32_e64 v25, s44, 1.0
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 8, v12
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[13:14], v[25:26], 24
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 24, v10
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[13:14], v[25:26], 16
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v10
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[13:14], v[25:26], 8
+; SI-NEXT:    v_add_f32_e64 v30, s29, 1.0
+; SI-NEXT:    v_add_f32_e64 v29, s28, 1.0
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 8, v10
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[13:14], v[29:30], 24
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 24, v8
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[13:14], v[29:30], 16
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v8
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[13:14], v[29:30], 8
+; SI-NEXT:    v_add_f32_e64 v36, s27, 1.0
+; SI-NEXT:    v_add_f32_e64 v35, s26, 1.0
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 8, v8
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 24, v6
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v6
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 8, v6
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 24, v4
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v4
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 8, v4
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 24, v2
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v2
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 8, v2
-; SI-NEXT:    v_alignbit_b32 v23, v53, v40, 24
-; SI-NEXT:    v_alignbit_b32 v26, v53, v40, 16
-; SI-NEXT:    v_alignbit_b32 v29, v53, v40, 8
-; SI-NEXT:    v_alignbit_b32 v32, v42, v44, 24
-; SI-NEXT:    v_alignbit_b32 v36, v42, v44, 16
-; SI-NEXT:    v_alignbit_b32 v39, v42, v44, 8
-; SI-NEXT:    v_alignbit_b32 v50, v47, v56, 24
-; SI-NEXT:    v_alignbit_b32 v54, v47, v56, 16
-; SI-NEXT:    v_alignbit_b32 v41, v47, v56, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v18
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshrrev_b32_e32 v24, 24, v25
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v25
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 8, v25
-; SI-NEXT:    v_lshrrev_b32_e32 v34, 24, v30
-; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v30
-; SI-NEXT:    v_lshrrev_b32_e32 v49, 8, v30
-; SI-NEXT:    v_lshrrev_b32_e32 v52, 24, v35
-; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v35
-; SI-NEXT:    v_lshrrev_b32_e32 v43, 8, v35
-; SI-NEXT:    v_lshrrev_b32_e32 v45, 24, v48
-; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v48
-; SI-NEXT:    v_lshrrev_b32_e32 v57, 8, v48
-; SI-NEXT:    v_lshrrev_b32_e32 v58, 24, v53
-; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v53
-; SI-NEXT:    v_lshrrev_b32_e32 v60, 8, v53
-; SI-NEXT:    v_lshrrev_b32_e32 v61, 24, v42
-; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v42
-; SI-NEXT:    v_lshrrev_b32_e32 v63, 8, v42
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 24, v47
-; SI-NEXT:    v_lshrrev_b32_e32 v20, 16, v47
-; SI-NEXT:    v_lshrrev_b32_e32 v21, 8, v47
-; SI-NEXT:    s_cbranch_execnz .LBB37_3
-; SI-NEXT:  .LBB37_2: ; %cmp.true
-; SI-NEXT:    v_add_f32_e32 v18, 1.0, v18
-; SI-NEXT:    v_add_f32_e32 v17, 1.0, v17
-; SI-NEXT:    v_alignbit_b32 v19, v18, v17, 24
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[13:14], v[35:36], 24
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v18, v17, 16
-; SI-NEXT:    v_add_f32_e32 v16, 1.0, v16
-; SI-NEXT:    v_add_f32_e32 v15, 1.0, v15
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[13:14], v[35:36], 16
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v18, v17, 8
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[13:14], v[35:36], 8
+; SI-NEXT:    v_add_f32_e64 v49, s25, 1.0
+; SI-NEXT:    v_add_f32_e64 v48, s24, 1.0
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v16, v15, 24
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[13:14], v[48:49], 24
+; SI-NEXT:    v_lshr_b64 v[17:18], v[48:49], 8
+; SI-NEXT:    v_add_f32_e64 v53, s23, 1.0
+; SI-NEXT:    v_add_f32_e64 v52, s22, 1.0
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshr_b64 v[17:18], v[52:53], 24
+; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshr_b64 v[17:18], v[52:53], 16
+; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[13:14], v[48:49], 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v16, v15, 16
-; SI-NEXT:    v_add_f32_e32 v14, 1.0, v14
-; SI-NEXT:    v_add_f32_e32 v13, 1.0, v13
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[17:18], v[52:53], 8
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 24, v2
+; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v16, v15, 8
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v14, v13, 24
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 8, v2
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v14, v13, 16
-; SI-NEXT:    v_add_f32_e32 v12, 1.0, v12
-; SI-NEXT:    v_add_f32_e32 v11, 1.0, v11
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 24, v4
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v14, v13, 8
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v4
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v12, v11, 24
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 8, v4
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v12, v11, 16
-; SI-NEXT:    v_add_f32_e32 v10, 1.0, v10
-; SI-NEXT:    v_add_f32_e32 v9, 1.0, v9
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 24, v6
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v12, v11, 8
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v10, v9, 24
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 8, v6
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v10, v9, 16
-; SI-NEXT:    v_add_f32_e32 v8, 1.0, v8
-; SI-NEXT:    v_add_f32_e32 v7, 1.0, v7
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 24, v8
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v10, v9, 8
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v8
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v8, v7, 24
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 8, v8
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v8, v7, 16
-; SI-NEXT:    v_add_f32_e32 v6, 1.0, v6
-; SI-NEXT:    v_add_f32_e32 v5, 1.0, v5
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 24, v10
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v8, v7, 8
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v10
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v6, v5, 24
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 8, v10
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v6, v5, 16
-; SI-NEXT:    v_add_f32_e32 v4, 1.0, v4
-; SI-NEXT:    v_add_f32_e32 v3, 1.0, v3
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 24, v12
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v6, v5, 8
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v12
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v4, v3, 24
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 8, v12
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v4, v3, 16
-; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_add_f32_e32 v1, 1.0, v1
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 24, v16
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v4, v3, 8
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v16
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v2, v1, 24
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 8, v16
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v2, v1, 16
-; SI-NEXT:    v_add_f32_e32 v25, 1.0, v25
-; SI-NEXT:    v_add_f32_e32 v28, 1.0, v28
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 24, v21
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v21
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 8, v21
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 24, v26
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v26
+; SI-NEXT:    v_add_f32_e64 v41, s21, 1.0
+; SI-NEXT:    v_add_f32_e64 v40, s20, 1.0
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 8, v26
+; SI-NEXT:    v_add_f32_e64 v58, s17, 1.0
+; SI-NEXT:    v_add_f32_e64 v57, s16, 1.0
+; SI-NEXT:    v_add_f32_e64 v46, s19, 1.0
+; SI-NEXT:    v_add_f32_e64 v45, s18, 1.0
+; SI-NEXT:    v_lshr_b64 v[31:32], v[40:41], 16
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 24, v30
+; SI-NEXT:    v_lshr_b64 v[32:33], v[40:41], 8
+; SI-NEXT:    v_lshr_b64 v[37:38], v[45:46], 16
+; SI-NEXT:    v_lshr_b64 v[42:43], v[57:58], 16
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v30
+; SI-NEXT:    v_lshr_b64 v[27:28], v[40:41], 24
+; SI-NEXT:    v_lshr_b64 v[33:34], v[45:46], 24
+; SI-NEXT:    v_lshr_b64 v[38:39], v[45:46], 8
+; SI-NEXT:    v_lshr_b64 v[50:51], v[57:58], 24
+; SI-NEXT:    v_lshr_b64 v[43:44], v[57:58], 8
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 8, v30
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v55, 24, v36
+; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v36
+; SI-NEXT:    v_lshrrev_b32_e32 v47, 8, v36
+; SI-NEXT:    v_lshrrev_b32_e32 v56, 24, v49
+; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v49
+; SI-NEXT:    v_lshrrev_b32_e32 v60, 8, v49
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 24, v53
+; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v53
+; SI-NEXT:    v_lshrrev_b32_e32 v17, 8, v53
+; SI-NEXT:    v_lshrrev_b32_e32 v24, 24, v41
+; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v41
+; SI-NEXT:    v_lshrrev_b32_e32 v34, 8, v41
+; SI-NEXT:    v_lshrrev_b32_e32 v18, 24, v46
+; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v46
+; SI-NEXT:    v_lshrrev_b32_e32 v23, 8, v46
+; SI-NEXT:    v_lshrrev_b32_e32 v39, 24, v58
+; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v58
+; SI-NEXT:    v_lshrrev_b32_e32 v54, 8, v58
+; SI-NEXT:    s_branch .LBB37_5
+; SI-NEXT:  .LBB37_3:
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s48, 0
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s49, 1
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr38
+; SI-NEXT:    ; implicit-def: $sgpr36
+; SI-NEXT:    ; implicit-def: $sgpr34
+; SI-NEXT:    ; implicit-def: $sgpr30
+; SI-NEXT:    ; implicit-def: $sgpr94
+; SI-NEXT:    ; implicit-def: $sgpr92
+; SI-NEXT:    ; implicit-def: $sgpr90
+; SI-NEXT:    ; implicit-def: $sgpr88
+; SI-NEXT:    ; implicit-def: $sgpr78
+; SI-NEXT:    ; implicit-def: $sgpr76
+; SI-NEXT:    ; implicit-def: $sgpr74
+; SI-NEXT:    ; implicit-def: $sgpr72
+; SI-NEXT:    ; implicit-def: $sgpr62
+; SI-NEXT:    ; implicit-def: $sgpr60
+; SI-NEXT:    ; implicit-def: $sgpr58
+; SI-NEXT:    ; implicit-def: $sgpr56
+; SI-NEXT:    ; implicit-def: $sgpr98
+; SI-NEXT:    ; implicit-def: $sgpr96
+; SI-NEXT:    ; implicit-def: $sgpr86
+; SI-NEXT:    ; implicit-def: $sgpr84
+; SI-NEXT:    ; implicit-def: $sgpr82
+; SI-NEXT:    ; implicit-def: $sgpr80
+; SI-NEXT:    ; implicit-def: $sgpr70
+; SI-NEXT:    ; implicit-def: $sgpr68
+; SI-NEXT:    ; implicit-def: $sgpr66
+; SI-NEXT:    ; implicit-def: $sgpr64
+; SI-NEXT:    ; implicit-def: $sgpr54
+; SI-NEXT:    ; implicit-def: $sgpr52
+; SI-NEXT:    ; implicit-def: $sgpr50
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s48, 2
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s49, 3
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s48, 4
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s49, 5
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s48, 6
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s49, 7
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s48, 8
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s49, 9
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s48, 10
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s49, 11
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s48, 12
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s49, 13
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s48, 14
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s49, 15
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s48, 16
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s49, 17
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s48, 18
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s49, 19
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s48, 20
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s49, 21
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s48, 22
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s49, 23
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s48, 24
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s49, 25
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s48, 26
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s49, 27
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s48, 28
+; SI-NEXT:    v_writelane_b32 v61, s49, 29
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s48, 30
+; SI-NEXT:    v_writelane_b32 v61, s49, 31
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s48, 32
+; SI-NEXT:    v_writelane_b32 v61, s49, 33
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    s_branch .LBB37_2
+; SI-NEXT:  .LBB37_4:
+; SI-NEXT:    v_mov_b32_e32 v1, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 34
+; SI-NEXT:    v_mov_b32_e32 v54, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 35
+; SI-NEXT:    v_mov_b32_e32 v51, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 36
+; SI-NEXT:    v_mov_b32_e32 v39, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 37
+; SI-NEXT:    v_mov_b32_e32 v23, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 38
+; SI-NEXT:    v_mov_b32_e32 v22, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 39
+; SI-NEXT:    v_mov_b32_e32 v18, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 40
+; SI-NEXT:    v_mov_b32_e32 v34, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 41
+; SI-NEXT:    v_mov_b32_e32 v28, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 42
+; SI-NEXT:    v_mov_b32_e32 v24, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 43
+; SI-NEXT:    v_mov_b32_e32 v17, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 44
+; SI-NEXT:    v_mov_b32_e32 v19, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 45
+; SI-NEXT:    v_mov_b32_e32 v14, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 46
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v60, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 47
+; SI-NEXT:    v_mov_b32_e32 v59, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 48
+; SI-NEXT:    v_mov_b32_e32 v56, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 49
+; SI-NEXT:    v_mov_b32_e32 v47, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 50
+; SI-NEXT:    v_mov_b32_e32 v44, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 51
+; SI-NEXT:    v_mov_b32_e32 v55, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 52
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 53
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v2, v1, 8
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 54
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v25, v28, 24
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 55
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v25, v28, 16
-; SI-NEXT:    v_add_f32_e32 v30, 1.0, v30
-; SI-NEXT:    v_add_f32_e32 v33, 1.0, v33
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 56
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v25, v28, 8
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 57
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v30, v33, 24
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 58
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v30, v33, 16
-; SI-NEXT:    v_add_f32_e32 v35, 1.0, v35
-; SI-NEXT:    v_add_f32_e32 v38, 1.0, v38
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 59
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v30, v33, 8
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 60
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v35, v38, 24
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 61
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v35, v38, 16
-; SI-NEXT:    v_add_f32_e32 v48, 1.0, v48
-; SI-NEXT:    v_add_f32_e32 v51, 1.0, v51
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 62
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v35, v38, 8
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 63
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v48, v51, 24
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    v_readlane_b32 s4, v62, 0
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v48, v51, 16
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    v_readlane_b32 s4, v62, 1
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v48, v51, 8
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    v_readlane_b32 s4, v62, 2
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 24, v18
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    v_readlane_b32 s4, v62, 3
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 8, v18
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    v_readlane_b32 s4, v62, 4
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 24, v16
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    v_readlane_b32 s4, v62, 5
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v16
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    v_readlane_b32 s4, v62, 6
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 8, v16
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    v_readlane_b32 s4, v62, 7
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 24, v14
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    v_readlane_b32 s4, v62, 8
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v14
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    v_readlane_b32 s4, v62, 9
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 8, v14
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    v_readlane_b32 s4, v62, 10
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 24, v12
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    v_readlane_b32 s4, v62, 11
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v12
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    v_readlane_b32 s4, v62, 12
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 8, v12
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    v_readlane_b32 s4, v62, 13
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 24, v10
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    v_readlane_b32 s4, v62, 14
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v10
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    v_readlane_b32 s4, v62, 15
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 8, v10
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    v_readlane_b32 s4, v62, 16
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 24, v8
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    v_readlane_b32 s4, v62, 17
+; SI-NEXT:    v_mov_b32_e32 v2, s5
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v8
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 32
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 8, v8
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 24, v6
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v6
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 8, v6
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 24, v4
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v4
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 8, v4
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 24, v2
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v2
-; SI-NEXT:    v_add_f32_e32 v47, 1.0, v47
-; SI-NEXT:    v_add_f32_e32 v56, 1.0, v56
-; SI-NEXT:    v_add_f32_e32 v42, 1.0, v42
-; SI-NEXT:    v_add_f32_e32 v44, 1.0, v44
-; SI-NEXT:    v_add_f32_e32 v53, 1.0, v53
-; SI-NEXT:    v_add_f32_e32 v40, 1.0, v40
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 8, v2
-; SI-NEXT:    v_alignbit_b32 v23, v53, v40, 24
-; SI-NEXT:    v_alignbit_b32 v26, v53, v40, 16
-; SI-NEXT:    v_alignbit_b32 v29, v53, v40, 8
-; SI-NEXT:    v_alignbit_b32 v32, v42, v44, 24
-; SI-NEXT:    v_alignbit_b32 v36, v42, v44, 16
-; SI-NEXT:    v_alignbit_b32 v39, v42, v44, 8
-; SI-NEXT:    v_alignbit_b32 v50, v47, v56, 24
-; SI-NEXT:    v_alignbit_b32 v54, v47, v56, 16
-; SI-NEXT:    v_alignbit_b32 v41, v47, v56, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v18
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshrrev_b32_e32 v24, 24, v25
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v25
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 8, v25
-; SI-NEXT:    v_lshrrev_b32_e32 v34, 24, v30
-; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v30
-; SI-NEXT:    v_lshrrev_b32_e32 v49, 8, v30
-; SI-NEXT:    v_lshrrev_b32_e32 v52, 24, v35
-; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v35
-; SI-NEXT:    v_lshrrev_b32_e32 v43, 8, v35
-; SI-NEXT:    v_lshrrev_b32_e32 v45, 24, v48
-; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v48
-; SI-NEXT:    v_lshrrev_b32_e32 v57, 8, v48
-; SI-NEXT:    v_lshrrev_b32_e32 v58, 24, v53
-; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v53
-; SI-NEXT:    v_lshrrev_b32_e32 v60, 8, v53
-; SI-NEXT:    v_lshrrev_b32_e32 v61, 24, v42
-; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v42
-; SI-NEXT:    v_lshrrev_b32_e32 v63, 8, v42
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 24, v47
-; SI-NEXT:    v_lshrrev_b32_e32 v20, 16, v47
-; SI-NEXT:    v_lshrrev_b32_e32 v21, 8, v47
-; SI-NEXT:  .LBB37_3: ; %end
-; SI-NEXT:    v_and_b32_e32 v56, 0xff, v56
-; SI-NEXT:    v_lshlrev_b32_e32 v41, 8, v41
-; SI-NEXT:    v_and_b32_e32 v54, 0xff, v54
-; SI-NEXT:    v_or_b32_e32 v41, v56, v41
-; SI-NEXT:    v_lshlrev_b32_e32 v54, 16, v54
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v13, s48
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT:    v_readlane_b32 s5, v61, 33
+; SI-NEXT:    v_readlane_b32 s4, v61, 30
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT:    v_readlane_b32 s5, v61, 31
+; SI-NEXT:    v_readlane_b32 s4, v61, 28
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT:    v_readlane_b32 s5, v61, 29
+; SI-NEXT:    v_readlane_b32 s4, v61, 26
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT:    v_readlane_b32 s5, v61, 27
+; SI-NEXT:    v_readlane_b32 s4, v61, 24
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT:    v_readlane_b32 s5, v61, 25
+; SI-NEXT:    v_readlane_b32 s4, v61, 22
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT:    v_readlane_b32 s5, v61, 23
+; SI-NEXT:    v_readlane_b32 s4, v61, 20
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT:    v_readlane_b32 s5, v61, 21
+; SI-NEXT:    v_readlane_b32 s4, v61, 18
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT:    v_readlane_b32 s5, v61, 19
+; SI-NEXT:    v_readlane_b32 s4, v61, 16
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT:    v_readlane_b32 s5, v61, 17
+; SI-NEXT:    v_readlane_b32 s4, v61, 14
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    v_readlane_b32 s5, v61, 15
+; SI-NEXT:    v_readlane_b32 s4, v61, 12
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    v_readlane_b32 s5, v61, 13
+; SI-NEXT:    v_readlane_b32 s4, v61, 10
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    v_readlane_b32 s5, v61, 11
+; SI-NEXT:    v_readlane_b32 s4, v61, 8
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT:    v_readlane_b32 s5, v61, 9
+; SI-NEXT:    v_readlane_b32 s4, v61, 6
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    v_readlane_b32 s5, v61, 7
+; SI-NEXT:    v_readlane_b32 s4, v61, 4
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    v_readlane_b32 s5, v61, 5
+; SI-NEXT:    v_readlane_b32 s4, v61, 2
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT:    v_readlane_b32 s5, v61, 3
+; SI-NEXT:    v_readlane_b32 s4, v61, 0
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v13, s50
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v13, s52
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v13, s54
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v13, s64
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v13, s66
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v13, s68
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v13, s70
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v13, s80
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v13, s82
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v13, s84
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v13, s86
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v13, s96
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v13, s98
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v13, s46
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v13, s56
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v13, s58
+; SI-NEXT:    v_mov_b32_e32 v27, s62
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v27, s72
+; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v27, s74
+; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v27, s76
+; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v57, s16
+; SI-NEXT:    v_mov_b32_e32 v58, s17
+; SI-NEXT:    v_mov_b32_e32 v45, s18
+; SI-NEXT:    v_mov_b32_e32 v46, s19
+; SI-NEXT:    v_mov_b32_e32 v40, s20
+; SI-NEXT:    v_mov_b32_e32 v41, s21
+; SI-NEXT:    v_mov_b32_e32 v52, s22
+; SI-NEXT:    v_mov_b32_e32 v53, s23
+; SI-NEXT:    v_mov_b32_e32 v48, s24
+; SI-NEXT:    v_mov_b32_e32 v49, s25
+; SI-NEXT:    v_mov_b32_e32 v35, s26
+; SI-NEXT:    v_mov_b32_e32 v36, s27
+; SI-NEXT:    v_mov_b32_e32 v29, s28
+; SI-NEXT:    v_mov_b32_e32 v30, s29
+; SI-NEXT:    v_mov_b32_e32 v25, s44
+; SI-NEXT:    v_mov_b32_e32 v26, s45
+; SI-NEXT:    v_mov_b32_e32 v20, s42
+; SI-NEXT:    v_mov_b32_e32 v21, s43
+; SI-NEXT:    v_mov_b32_e32 v15, s40
+; SI-NEXT:    v_mov_b32_e32 v16, s41
+; SI-NEXT:    v_mov_b32_e32 v11, s14
+; SI-NEXT:    v_mov_b32_e32 v12, s15
+; SI-NEXT:    v_mov_b32_e32 v9, s12
+; SI-NEXT:    v_mov_b32_e32 v10, s13
+; SI-NEXT:    v_mov_b32_e32 v7, s10
+; SI-NEXT:    v_mov_b32_e32 v8, s11
+; SI-NEXT:    v_mov_b32_e32 v5, s8
+; SI-NEXT:    v_mov_b32_e32 v6, s9
+; SI-NEXT:    v_mov_b32_e32 v3, s6
+; SI-NEXT:    v_mov_b32_e32 v4, s7
+; SI-NEXT:    v_readlane_b32 s5, v61, 1
+; SI-NEXT:    v_mov_b32_e32 v13, s60
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v27, s78
+; SI-NEXT:    v_mov_b32_e32 v31, s88
+; SI-NEXT:    v_mov_b32_e32 v32, s90
+; SI-NEXT:    v_mov_b32_e32 v33, s92
+; SI-NEXT:    v_mov_b32_e32 v37, s94
+; SI-NEXT:    v_mov_b32_e32 v38, s30
+; SI-NEXT:    v_mov_b32_e32 v50, s34
+; SI-NEXT:    v_mov_b32_e32 v42, s36
+; SI-NEXT:    v_mov_b32_e32 v43, s38
+; SI-NEXT:  .LBB37_5: ; %end
+; SI-NEXT:    v_lshlrev_b32_e32 v43, 8, v43
+; SI-NEXT:    v_and_b32_e32 v57, 0xff, v57
+; SI-NEXT:    v_and_b32_e32 v42, 0xff, v42
+; SI-NEXT:    v_or_b32_e32 v43, v57, v43
 ; SI-NEXT:    v_lshlrev_b32_e32 v50, 24, v50
-; SI-NEXT:    v_or_b32_e32 v50, v50, v54
-; SI-NEXT:    v_and_b32_e32 v54, 0xffff, v41
-; SI-NEXT:    v_or_b32_e32 v50, v54, v50
+; SI-NEXT:    v_lshlrev_b32_e32 v42, 16, v42
+; SI-NEXT:    v_or_b32_e32 v50, v50, v42
+; SI-NEXT:    v_and_b32_e32 v42, 0xffff, v43
+; SI-NEXT:    v_or_b32_e32 v50, v42, v50
 ; SI-NEXT:    buffer_store_dword v50, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v50, 0xff, v47
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 8, v21
-; SI-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; SI-NEXT:    v_or_b32_e32 v21, v50, v21
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 24, v19
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v21
-; SI-NEXT:    v_or_b32_e32 v19, v20, v19
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 4, v0
-; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v50, 0xff, v58
+; SI-NEXT:    v_lshlrev_b32_e32 v54, 8, v54
+; SI-NEXT:    v_and_b32_e32 v51, 0xff, v51
+; SI-NEXT:    v_or_b32_e32 v50, v50, v54
+; SI-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
+; SI-NEXT:    v_lshlrev_b32_e32 v39, 24, v39
+; SI-NEXT:    v_or_b32_e32 v39, v39, v51
+; SI-NEXT:    v_and_b32_e32 v50, 0xffff, v50
+; SI-NEXT:    v_or_b32_e32 v39, v50, v39
+; SI-NEXT:    v_add_i32_e32 v50, vcc, 4, v0
+; SI-NEXT:    buffer_store_dword v39, v50, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v38, 8, v38
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v39, 0xff, v45
+; SI-NEXT:    v_and_b32_e32 v37, 0xff, v37
+; SI-NEXT:    v_or_b32_e32 v38, v39, v38
+; SI-NEXT:    v_lshlrev_b32_e32 v33, 24, v33
+; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v37
+; SI-NEXT:    v_or_b32_e32 v33, v33, v37
+; SI-NEXT:    v_and_b32_e32 v37, 0xffff, v38
+; SI-NEXT:    v_or_b32_e32 v33, v37, v33
+; SI-NEXT:    v_add_i32_e32 v37, vcc, 8, v0
+; SI-NEXT:    buffer_store_dword v33, v37, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v19, 0xff, v44
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 8, v39
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    v_and_b32_e32 v20, 0xff, v36
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 24, v32
-; SI-NEXT:    v_or_b32_e32 v20, v21, v20
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 8, v0
-; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v33, 0xff, v46
+; SI-NEXT:    v_lshlrev_b32_e32 v23, 8, v23
+; SI-NEXT:    v_and_b32_e32 v22, 0xff, v22
+; SI-NEXT:    v_or_b32_e32 v23, v33, v23
+; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 24, v18
+; SI-NEXT:    v_or_b32_e32 v18, v18, v22
+; SI-NEXT:    v_and_b32_e32 v22, 0xffff, v23
+; SI-NEXT:    v_or_b32_e32 v18, v22, v18
+; SI-NEXT:    v_add_i32_e32 v22, vcc, 12, v0
+; SI-NEXT:    buffer_store_dword v18, v22, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 8, v32
+; SI-NEXT:    v_and_b32_e32 v22, 0xff, v40
+; SI-NEXT:    v_or_b32_e32 v18, v22, v18
+; SI-NEXT:    v_and_b32_e32 v22, 0xff, v31
+; SI-NEXT:    v_lshlrev_b32_e32 v23, 24, v27
+; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; SI-NEXT:    v_or_b32_e32 v22, v23, v22
+; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v18
+; SI-NEXT:    v_or_b32_e32 v18, v18, v22
+; SI-NEXT:    v_add_i32_e32 v22, vcc, 16, v0
+; SI-NEXT:    buffer_store_dword v18, v22, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v18, 0xff, v41
+; SI-NEXT:    v_lshlrev_b32_e32 v22, 8, v34
+; SI-NEXT:    v_or_b32_e32 v18, v18, v22
+; SI-NEXT:    v_and_b32_e32 v22, 0xff, v28
+; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; SI-NEXT:    v_lshlrev_b32_e32 v23, 24, v24
+; SI-NEXT:    v_or_b32_e32 v22, v23, v22
+; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v18
+; SI-NEXT:    v_or_b32_e32 v18, v18, v22
+; SI-NEXT:    v_add_i32_e32 v22, vcc, 20, v0
+; SI-NEXT:    buffer_store_dword v18, v22, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 24, v14
+; SI-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; SI-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; SI-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; SI-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT:    v_readlane_b32 s99, v63, 35
+; SI-NEXT:    v_readlane_b32 s98, v63, 34
+; SI-NEXT:    v_readlane_b32 s97, v63, 33
+; SI-NEXT:    v_readlane_b32 s96, v63, 32
+; SI-NEXT:    v_readlane_b32 s87, v63, 31
+; SI-NEXT:    v_readlane_b32 s86, v63, 30
+; SI-NEXT:    v_readlane_b32 s85, v63, 29
+; SI-NEXT:    v_readlane_b32 s84, v63, 28
+; SI-NEXT:    v_readlane_b32 s83, v63, 27
+; SI-NEXT:    v_readlane_b32 s82, v63, 26
+; SI-NEXT:    v_readlane_b32 s81, v63, 25
+; SI-NEXT:    v_readlane_b32 s80, v63, 24
+; SI-NEXT:    v_readlane_b32 s71, v63, 23
+; SI-NEXT:    v_readlane_b32 s70, v63, 22
+; SI-NEXT:    v_readlane_b32 s69, v63, 21
+; SI-NEXT:    v_readlane_b32 s68, v63, 20
+; SI-NEXT:    v_readlane_b32 s67, v63, 19
+; SI-NEXT:    v_readlane_b32 s66, v63, 18
+; SI-NEXT:    v_readlane_b32 s65, v63, 17
+; SI-NEXT:    v_readlane_b32 s64, v63, 16
+; SI-NEXT:    v_readlane_b32 s55, v63, 15
+; SI-NEXT:    v_readlane_b32 s54, v63, 14
+; SI-NEXT:    v_readlane_b32 s53, v63, 13
+; SI-NEXT:    v_readlane_b32 s52, v63, 12
+; SI-NEXT:    v_readlane_b32 s51, v63, 11
+; SI-NEXT:    v_readlane_b32 s50, v63, 10
+; SI-NEXT:    v_readlane_b32 s49, v63, 9
+; SI-NEXT:    v_readlane_b32 s48, v63, 8
+; SI-NEXT:    v_readlane_b32 s39, v63, 7
+; SI-NEXT:    v_readlane_b32 s38, v63, 6
+; SI-NEXT:    v_readlane_b32 s37, v63, 5
+; SI-NEXT:    v_readlane_b32 s36, v63, 4
+; SI-NEXT:    v_readlane_b32 s35, v63, 3
+; SI-NEXT:    v_readlane_b32 s34, v63, 2
+; SI-NEXT:    v_readlane_b32 s31, v63, 1
+; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    s_waitcnt vmcnt(1) expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 8, v22
+; SI-NEXT:    v_and_b32_e32 v22, 0xff, v52
+; SI-NEXT:    v_or_b32_e32 v18, v22, v18
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v18
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_and_b32_e32 v22, 0xff, v22
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v23, 24, v23
+; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; SI-NEXT:    v_or_b32_e32 v22, v23, v22
+; SI-NEXT:    v_or_b32_e32 v18, v18, v22
+; SI-NEXT:    v_add_i32_e32 v22, vcc, 24, v0
+; SI-NEXT:    buffer_store_dword v18, v22, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v19, 0xff, v42
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 8, v63
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    v_and_b32_e32 v20, 0xff, v62
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 24, v61
-; SI-NEXT:    v_or_b32_e32 v20, v21, v20
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 12, v0
-; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v18, 0xff, v53
+; SI-NEXT:    v_or_b32_e32 v17, v18, v17
+; SI-NEXT:    v_and_b32_e32 v18, 0xff, v19
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; SI-NEXT:    v_or_b32_e32 v14, v14, v18
+; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; SI-NEXT:    v_or_b32_e32 v14, v17, v14
+; SI-NEXT:    v_add_i32_e32 v17, vcc, 28, v0
+; SI-NEXT:    buffer_store_dword v14, v17, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1) expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 8, v17
+; SI-NEXT:    v_and_b32_e32 v17, 0xff, v48
+; SI-NEXT:    v_or_b32_e32 v14, v17, v14
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 24, v17
+; SI-NEXT:    v_or_b32_e32 v13, v17, v13
+; SI-NEXT:    v_or_b32_e32 v13, v14, v13
+; SI-NEXT:    v_add_i32_e32 v14, vcc, 32, v0
+; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v19, 0xff, v40
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 8, v29
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    v_and_b32_e32 v20, 0xff, v26
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 24, v23
-; SI-NEXT:    v_or_b32_e32 v20, v21, v20
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 16, v0
-; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v13, 0xff, v49
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 8, v60
+; SI-NEXT:    v_or_b32_e32 v13, v13, v14
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v59
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 24, v56
+; SI-NEXT:    v_or_b32_e32 v14, v17, v14
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; SI-NEXT:    v_or_b32_e32 v13, v13, v14
+; SI-NEXT:    v_add_i32_e32 v14, vcc, 36, v0
+; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v19, 0xff, v53
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 8, v60
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    v_and_b32_e32 v20, 0xff, v59
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 24, v58
-; SI-NEXT:    v_or_b32_e32 v20, v21, v20
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 20, v0
-; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v35
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
+; SI-NEXT:    v_or_b32_e32 v13, v14, v13
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v17
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 24, v17
+; SI-NEXT:    v_or_b32_e32 v14, v17, v14
+; SI-NEXT:    v_or_b32_e32 v13, v13, v14
+; SI-NEXT:    v_add_i32_e32 v14, vcc, 40, v0
+; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v19, 0xff, v51
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v13, 0xff, v36
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 8, v47
+; SI-NEXT:    v_or_b32_e32 v13, v13, v14
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v44
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 24, v55
+; SI-NEXT:    v_or_b32_e32 v14, v17, v14
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; SI-NEXT:    v_or_b32_e32 v13, v13, v14
+; SI-NEXT:    v_add_i32_e32 v14, vcc, 44, v0
+; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v29
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
+; SI-NEXT:    v_or_b32_e32 v13, v14, v13
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 8, v20
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v17
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v13
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 24, v21
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 24, v17
+; SI-NEXT:    v_or_b32_e32 v14, v17, v14
+; SI-NEXT:    v_or_b32_e32 v13, v13, v14
+; SI-NEXT:    v_add_i32_e32 v14, vcc, 48, v0
+; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v13, 0xff, v30
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
+; SI-NEXT:    v_or_b32_e32 v13, v13, v14
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 24, v17
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v13
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; SI-NEXT:    v_or_b32_e32 v20, v21, v20
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 24, v0
-; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    v_or_b32_e32 v14, v17, v14
+; SI-NEXT:    v_or_b32_e32 v13, v13, v14
+; SI-NEXT:    v_add_i32_e32 v14, vcc, 52, v0
+; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v19, 0xff, v48
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 8, v57
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    v_and_b32_e32 v20, 0xff, v46
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 24, v45
-; SI-NEXT:    v_or_b32_e32 v20, v21, v20
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 28, v0
-; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v25
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
+; SI-NEXT:    v_or_b32_e32 v13, v14, v13
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v17
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 24, v17
+; SI-NEXT:    v_or_b32_e32 v14, v17, v14
+; SI-NEXT:    v_or_b32_e32 v13, v13, v14
+; SI-NEXT:    v_add_i32_e32 v14, vcc, 56, v0
+; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v19, 0xff, v38
-; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v13, 0xff, v26
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 8, v20
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
+; SI-NEXT:    v_or_b32_e32 v13, v13, v14
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 24, v21
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 24, v17
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v13
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; SI-NEXT:    v_or_b32_e32 v20, v21, v20
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 32, v0
-; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    v_or_b32_e32 v14, v17, v14
+; SI-NEXT:    v_or_b32_e32 v13, v13, v14
+; SI-NEXT:    v_add_i32_e32 v14, vcc, 60, v0
+; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v19, 0xff, v35
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 8, v43
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    v_and_b32_e32 v20, 0xff, v55
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 24, v52
-; SI-NEXT:    v_or_b32_e32 v20, v21, v20
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 36, v0
-; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v20
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
+; SI-NEXT:    v_or_b32_e32 v13, v14, v13
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v17
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 24, v17
+; SI-NEXT:    v_or_b32_e32 v14, v17, v14
+; SI-NEXT:    v_or_b32_e32 v13, v13, v14
+; SI-NEXT:    v_add_i32_e32 v14, vcc, 64, v0
+; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v19, 0xff, v33
-; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v13, 0xff, v21
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 8, v20
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
+; SI-NEXT:    v_or_b32_e32 v13, v13, v14
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 24, v21
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 24, v17
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v13
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; SI-NEXT:    v_or_b32_e32 v20, v21, v20
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 40, v0
-; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    v_or_b32_e32 v14, v17, v14
+; SI-NEXT:    v_or_b32_e32 v13, v13, v14
+; SI-NEXT:    v_add_i32_e32 v14, vcc, 0x44, v0
+; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v19, 0xff, v30
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 8, v49
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    v_and_b32_e32 v20, 0xff, v37
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 24, v34
-; SI-NEXT:    v_or_b32_e32 v20, v21, v20
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 44, v0
-; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v15
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
+; SI-NEXT:    v_or_b32_e32 v13, v14, v13
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 24, v17
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    v_or_b32_e32 v14, v15, v14
+; SI-NEXT:    v_or_b32_e32 v13, v13, v14
+; SI-NEXT:    v_add_i32_e32 v14, vcc, 0x48, v0
+; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v19, 0xff, v28
-; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v13, 0xff, v16
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 8, v20
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
+; SI-NEXT:    v_or_b32_e32 v13, v13, v14
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 24, v21
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 24, v15
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v13
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; SI-NEXT:    v_or_b32_e32 v20, v21, v20
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 48, v0
-; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    v_or_b32_e32 v14, v15, v14
+; SI-NEXT:    v_or_b32_e32 v13, v13, v14
+; SI-NEXT:    v_add_i32_e32 v14, vcc, 0x4c, v0
+; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v19, 0xff, v25
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 8, v31
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    v_and_b32_e32 v20, 0xff, v27
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 24, v24
-; SI-NEXT:    v_or_b32_e32 v20, v21, v20
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 52, v0
-; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
+; SI-NEXT:    v_or_b32_e32 v11, v11, v13
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 24, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; SI-NEXT:    v_or_b32_e32 v13, v14, v13
+; SI-NEXT:    v_or_b32_e32 v11, v11, v13
+; SI-NEXT:    v_add_i32_e32 v13, vcc, 0x50, v0
+; SI-NEXT:    buffer_store_dword v11, v13, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v11, 0xff, v12
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 8, v19
-; SI-NEXT:    v_or_b32_e32 v1, v1, v19
-; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v12, 8, v12
+; SI-NEXT:    v_or_b32_e32 v11, v11, v12
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 24, v20
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 24, v13
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v19, 0xff, v19
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; SI-NEXT:    v_or_b32_e32 v19, v20, v19
-; SI-NEXT:    v_or_b32_e32 v1, v1, v19
-; SI-NEXT:    v_add_i32_e32 v19, vcc, 56, v0
-; SI-NEXT:    buffer_store_dword v1, v19, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; SI-NEXT:    v_or_b32_e32 v12, v13, v12
+; SI-NEXT:    v_or_b32_e32 v11, v11, v12
+; SI-NEXT:    v_add_i32_e32 v12, vcc, 0x54, v0
+; SI-NEXT:    buffer_store_dword v11, v12, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
+; SI-NEXT:    v_or_b32_e32 v9, v9, v11
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 24, v19
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v19, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 60, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
+; SI-NEXT:    v_or_b32_e32 v11, v12, v11
+; SI-NEXT:    v_or_b32_e32 v9, v9, v11
+; SI-NEXT:    v_add_i32_e32 v11, vcc, 0x58, v0
+; SI-NEXT:    buffer_store_dword v9, v11, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v9, 0xff, v10
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
+; SI-NEXT:    v_or_b32_e32 v9, v9, v10
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 64, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_or_b32_e32 v10, v11, v10
+; SI-NEXT:    v_or_b32_e32 v9, v9, v10
+; SI-NEXT:    v_add_i32_e32 v10, vcc, 0x5c, v0
+; SI-NEXT:    buffer_store_dword v9, v10, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v4
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
+; SI-NEXT:    v_or_b32_e32 v7, v7, v9
+; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x44, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; SI-NEXT:    v_or_b32_e32 v9, v10, v9
+; SI-NEXT:    v_or_b32_e32 v7, v7, v9
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 0x60, v0
+; SI-NEXT:    buffer_store_dword v7, v9, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v5
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v7, 0xff, v8
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
+; SI-NEXT:    v_or_b32_e32 v7, v7, v8
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x48, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; SI-NEXT:    v_or_b32_e32 v8, v9, v8
+; SI-NEXT:    v_or_b32_e32 v7, v7, v8
+; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x64, v0
+; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v6
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
+; SI-NEXT:    v_or_b32_e32 v5, v5, v7
+; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x4c, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; SI-NEXT:    v_or_b32_e32 v7, v8, v7
+; SI-NEXT:    v_or_b32_e32 v5, v5, v7
+; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x68, v0
+; SI-NEXT:    buffer_store_dword v5, v7, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v7
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; SI-NEXT:    v_or_b32_e32 v5, v6, v5
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x50, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT:    v_or_b32_e32 v6, v7, v6
+; SI-NEXT:    v_or_b32_e32 v5, v5, v6
+; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x6c, v0
+; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v8
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; SI-NEXT:    v_or_b32_e32 v3, v3, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x54, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; SI-NEXT:    v_or_b32_e32 v5, v6, v5
+; SI-NEXT:    v_or_b32_e32 v3, v3, v5
+; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x70, v0
+; SI-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v9
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; SI-NEXT:    v_or_b32_e32 v3, v3, v4
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x58, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_or_b32_e32 v4, v5, v4
+; SI-NEXT:    v_or_b32_e32 v3, v3, v4
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x74, v0
+; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v10
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; SI-NEXT:    v_or_b32_e32 v3, v4, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x78, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
@@ -44157,331 +45365,27 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x5c, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v11
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x60, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v12
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x64, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v13
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x68, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v14
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x6c, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v15
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x70, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v16
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x74, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v17
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x78, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v18
-; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v22
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
-; SI-NEXT:  .LBB37_4:
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr41
-; SI-NEXT:    ; implicit-def: $vgpr54
-; SI-NEXT:    ; implicit-def: $vgpr50
-; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr20
-; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    ; implicit-def: $vgpr39
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr63
-; SI-NEXT:    ; implicit-def: $vgpr62
-; SI-NEXT:    ; implicit-def: $vgpr61
-; SI-NEXT:    ; implicit-def: $vgpr29
-; SI-NEXT:    ; implicit-def: $vgpr26
-; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; implicit-def: $vgpr60
-; SI-NEXT:    ; implicit-def: $vgpr59
-; SI-NEXT:    ; implicit-def: $vgpr58
-; SI-NEXT:    ; implicit-def: $vgpr57
-; SI-NEXT:    ; implicit-def: $vgpr46
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr43
-; SI-NEXT:    ; implicit-def: $vgpr55
-; SI-NEXT:    ; implicit-def: $vgpr52
-; SI-NEXT:    ; implicit-def: $vgpr49
-; SI-NEXT:    ; implicit-def: $vgpr37
-; SI-NEXT:    ; implicit-def: $vgpr34
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $vgpr24
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; kill: killed $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    s_branch .LBB37_2
 ;
 ; VI-LABEL: bitcast_v32f32_to_v128i8_scalar:
 ; VI:       ; %bb.0:
@@ -71182,21 +72086,21 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a,
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v19
-; SI-NEXT:    v_mov_b32_e32 v36, s16
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_mov_b32_e32 v35, s17
-; SI-NEXT:    v_mov_b32_e32 v33, s18
-; SI-NEXT:    v_mov_b32_e32 v32, s19
-; SI-NEXT:    v_mov_b32_e32 v31, s20
-; SI-NEXT:    v_mov_b32_e32 v29, s21
-; SI-NEXT:    v_mov_b32_e32 v28, s22
+; SI-NEXT:    v_mov_b32_e32 v31, s16
+; SI-NEXT:    v_mov_b32_e32 v32, s17
+; SI-NEXT:    v_mov_b32_e32 v29, s18
+; SI-NEXT:    v_mov_b32_e32 v30, s19
+; SI-NEXT:    v_mov_b32_e32 v27, s20
+; SI-NEXT:    v_mov_b32_e32 v28, s21
+; SI-NEXT:    v_mov_b32_e32 v25, s22
 ; SI-NEXT:    v_mov_b32_e32 v26, s23
-; SI-NEXT:    v_mov_b32_e32 v25, s24
+; SI-NEXT:    v_mov_b32_e32 v23, s24
 ; SI-NEXT:    v_mov_b32_e32 v24, s25
-; SI-NEXT:    v_mov_b32_e32 v22, s26
-; SI-NEXT:    v_mov_b32_e32 v21, s27
-; SI-NEXT:    v_mov_b32_e32 v20, s28
-; SI-NEXT:    v_mov_b32_e32 v19, s29
+; SI-NEXT:    v_mov_b32_e32 v21, s26
+; SI-NEXT:    v_mov_b32_e32 v22, s27
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    v_mov_b32_e32 v19, s28
+; SI-NEXT:    v_mov_b32_e32 v20, s29
 ; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
@@ -71215,234 +72119,242 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB49_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_alignbit_b32 v23, v18, v17, 16
-; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT:    v_alignbit_b32 v27, v16, v15, 16
-; SI-NEXT:    v_alignbit_b32 v30, v14, v13, 16
-; SI-NEXT:    v_alignbit_b32 v34, v12, v11, 16
-; SI-NEXT:    v_alignbit_b32 v37, v10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v38, v8, v7, 16
-; SI-NEXT:    v_alignbit_b32 v48, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v50, v4, v3, 16
-; SI-NEXT:    v_alignbit_b32 v52, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v54, v19, v20, 16
-; SI-NEXT:    v_alignbit_b32 v41, v21, v22, 16
-; SI-NEXT:    v_alignbit_b32 v43, v24, v25, 16
-; SI-NEXT:    v_alignbit_b32 v45, v26, v28, 16
-; SI-NEXT:    v_alignbit_b32 v47, v29, v31, 16
-; SI-NEXT:    s_waitcnt expcnt(6)
-; SI-NEXT:    v_alignbit_b32 v58, v32, v33, 16
-; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    v_alignbit_b32 v60, v35, v36, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v19
-; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v21
-; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v24
-; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v26
-; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v29
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v32
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v35
+; SI-NEXT:    v_lshr_b64 v[36:37], v[17:18], 16
+; SI-NEXT:    v_lshr_b64 v[51:52], v[5:6], 16
+; SI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshr_b64 v[36:37], v[15:16], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v18
+; SI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshr_b64 v[36:37], v[13:14], 16
+; SI-NEXT:    v_lshr_b64 v[52:53], v[21:22], 16
+; SI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshr_b64 v[36:37], v[11:12], 16
+; SI-NEXT:    v_lshr_b64 v[37:38], v[9:10], 16
+; SI-NEXT:    v_lshr_b64 v[38:39], v[7:8], 16
+; SI-NEXT:    v_mov_b32_e32 v53, v40
+; SI-NEXT:    v_lshr_b64 v[39:40], v[23:24], 16
+; SI-NEXT:    v_lshr_b64 v[40:41], v[25:26], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v16
+; SI-NEXT:    v_lshr_b64 v[54:55], v[3:4], 16
+; SI-NEXT:    v_lshr_b64 v[41:42], v[27:28], 16
+; SI-NEXT:    v_mov_b32_e32 v55, v48
+; SI-NEXT:    v_lshr_b64 v[48:49], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[42:43], v[29:30], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v24
+; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v26
+; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v28
+; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v30
+; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v32
+; SI-NEXT:    v_lshr_b64 v[49:50], v[19:20], 16
+; SI-NEXT:    v_lshr_b64 v[43:44], v[31:32], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB49_3
 ; SI-NEXT:  .LBB49_2: ; %cmp.true
 ; SI-NEXT:    v_add_f32_e32 v18, 1.0, v18
 ; SI-NEXT:    v_add_f32_e32 v17, 1.0, v17
-; SI-NEXT:    v_add_f32_e32 v35, 1.0, v35
-; SI-NEXT:    v_add_f32_e32 v36, 1.0, v36
-; SI-NEXT:    v_add_f32_e32 v32, 1.0, v32
-; SI-NEXT:    v_add_f32_e32 v33, 1.0, v33
-; SI-NEXT:    v_add_f32_e32 v29, 1.0, v29
-; SI-NEXT:    v_add_f32_e32 v31, 1.0, v31
-; SI-NEXT:    v_add_f32_e32 v26, 1.0, v26
-; SI-NEXT:    v_add_f32_e32 v28, 1.0, v28
+; SI-NEXT:    v_add_f32_e32 v12, 1.0, v12
+; SI-NEXT:    v_add_f32_e32 v11, 1.0, v11
+; SI-NEXT:    v_lshr_b64 v[33:34], v[17:18], 16
+; SI-NEXT:    v_add_f32_e32 v10, 1.0, v10
+; SI-NEXT:    v_add_f32_e32 v9, 1.0, v9
+; SI-NEXT:    v_lshr_b64 v[36:37], v[11:12], 16
+; SI-NEXT:    v_add_f32_e32 v8, 1.0, v8
+; SI-NEXT:    v_add_f32_e32 v7, 1.0, v7
+; SI-NEXT:    v_add_f32_e32 v16, 1.0, v16
+; SI-NEXT:    v_add_f32_e32 v15, 1.0, v15
+; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[37:38], v[9:10], 16
 ; SI-NEXT:    v_add_f32_e32 v24, 1.0, v24
+; SI-NEXT:    v_add_f32_e32 v23, 1.0, v23
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshr_b64 v[33:34], v[15:16], 16
+; SI-NEXT:    v_lshr_b64 v[38:39], v[7:8], 16
+; SI-NEXT:    v_add_f32_e32 v26, 1.0, v26
 ; SI-NEXT:    v_add_f32_e32 v25, 1.0, v25
-; SI-NEXT:    v_add_f32_e32 v21, 1.0, v21
-; SI-NEXT:    v_add_f32_e32 v22, 1.0, v22
-; SI-NEXT:    v_add_f32_e32 v19, 1.0, v19
-; SI-NEXT:    v_add_f32_e32 v20, 1.0, v20
+; SI-NEXT:    v_lshr_b64 v[39:40], v[23:24], 16
+; SI-NEXT:    v_add_f32_e32 v28, 1.0, v28
+; SI-NEXT:    v_add_f32_e32 v27, 1.0, v27
+; SI-NEXT:    v_add_f32_e32 v14, 1.0, v14
+; SI-NEXT:    v_add_f32_e32 v13, 1.0, v13
+; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[40:41], v[25:26], 16
+; SI-NEXT:    v_add_f32_e32 v30, 1.0, v30
+; SI-NEXT:    v_add_f32_e32 v29, 1.0, v29
 ; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
 ; SI-NEXT:    v_add_f32_e32 v1, 1.0, v1
-; SI-NEXT:    v_add_f32_e32 v4, 1.0, v4
-; SI-NEXT:    v_add_f32_e32 v3, 1.0, v3
 ; SI-NEXT:    v_add_f32_e32 v6, 1.0, v6
 ; SI-NEXT:    v_add_f32_e32 v5, 1.0, v5
-; SI-NEXT:    v_add_f32_e32 v8, 1.0, v8
-; SI-NEXT:    v_add_f32_e32 v7, 1.0, v7
-; SI-NEXT:    v_add_f32_e32 v10, 1.0, v10
-; SI-NEXT:    v_add_f32_e32 v9, 1.0, v9
-; SI-NEXT:    v_add_f32_e32 v12, 1.0, v12
-; SI-NEXT:    v_add_f32_e32 v11, 1.0, v11
-; SI-NEXT:    v_add_f32_e32 v14, 1.0, v14
-; SI-NEXT:    v_add_f32_e32 v13, 1.0, v13
-; SI-NEXT:    v_add_f32_e32 v16, 1.0, v16
-; SI-NEXT:    v_add_f32_e32 v15, 1.0, v15
-; SI-NEXT:    v_alignbit_b32 v23, v18, v17, 16
-; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT:    v_alignbit_b32 v27, v16, v15, 16
-; SI-NEXT:    v_alignbit_b32 v30, v14, v13, 16
-; SI-NEXT:    v_alignbit_b32 v34, v12, v11, 16
-; SI-NEXT:    v_alignbit_b32 v37, v10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v38, v8, v7, 16
-; SI-NEXT:    v_alignbit_b32 v48, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v50, v4, v3, 16
-; SI-NEXT:    v_alignbit_b32 v52, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v54, v19, v20, 16
-; SI-NEXT:    v_alignbit_b32 v41, v21, v22, 16
-; SI-NEXT:    v_alignbit_b32 v43, v24, v25, 16
-; SI-NEXT:    v_alignbit_b32 v45, v26, v28, 16
-; SI-NEXT:    v_alignbit_b32 v47, v29, v31, 16
-; SI-NEXT:    s_waitcnt expcnt(6)
-; SI-NEXT:    v_alignbit_b32 v58, v32, v33, 16
-; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    v_alignbit_b32 v60, v35, v36, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v19
-; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v21
-; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v24
-; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v26
-; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v29
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshr_b64 v[33:34], v[13:14], 16
+; SI-NEXT:    v_lshr_b64 v[41:42], v[27:28], 16
+; SI-NEXT:    v_add_f32_e32 v32, 1.0, v32
+; SI-NEXT:    v_add_f32_e32 v31, 1.0, v31
+; SI-NEXT:    v_add_f32_e32 v22, 1.0, v22
+; SI-NEXT:    v_add_f32_e32 v21, 1.0, v21
+; SI-NEXT:    v_add_f32_e32 v20, 1.0, v20
+; SI-NEXT:    v_add_f32_e32 v19, 1.0, v19
+; SI-NEXT:    v_add_f32_e32 v4, 1.0, v4
+; SI-NEXT:    v_add_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_lshr_b64 v[51:52], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[48:49], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[42:43], v[29:30], 16
+; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[54:55], v[3:4], 16
+; SI-NEXT:    v_lshr_b64 v[49:50], v[19:20], 16
+; SI-NEXT:    v_lshr_b64 v[52:53], v[21:22], 16
+; SI-NEXT:    v_lshr_b64 v[43:44], v[31:32], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v22
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v32
+; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v24
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v35
+; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v26
+; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v28
+; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v30
+; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v32
 ; SI-NEXT:  .LBB49_3: ; %end
-; SI-NEXT:    v_and_b32_e32 v36, 0xffff, v36
-; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v60
-; SI-NEXT:    v_and_b32_e32 v35, 0xffff, v35
-; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; SI-NEXT:    v_or_b32_e32 v36, v36, v60
-; SI-NEXT:    v_or_b32_e32 v23, v35, v23
-; SI-NEXT:    v_add_i32_e32 v35, vcc, 4, v0
-; SI-NEXT:    buffer_store_dword v36, v0, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v23, v35, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v33
-; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v58
-; SI-NEXT:    v_or_b32_e32 v23, v23, v33
-; SI-NEXT:    v_add_i32_e32 v33, vcc, 8, v0
-; SI-NEXT:    buffer_store_dword v23, v33, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v32
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v63
-; SI-NEXT:    v_or_b32_e32 v23, v23, v32
-; SI-NEXT:    v_add_i32_e32 v32, vcc, 12, v0
-; SI-NEXT:    buffer_store_dword v23, v32, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v31
-; SI-NEXT:    v_lshlrev_b32_e32 v31, 16, v47
-; SI-NEXT:    v_or_b32_e32 v23, v23, v31
-; SI-NEXT:    v_add_i32_e32 v31, vcc, 16, v0
-; SI-NEXT:    buffer_store_dword v23, v31, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v29
-; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v62
-; SI-NEXT:    v_or_b32_e32 v23, v23, v29
-; SI-NEXT:    v_add_i32_e32 v29, vcc, 20, v0
-; SI-NEXT:    buffer_store_dword v23, v29, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v28
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v50, 16, v43
+; SI-NEXT:    v_and_b32_e32 v31, 0xffff, v31
+; SI-NEXT:    v_or_b32_e32 v31, v31, v50
+; SI-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v31, 0xffff, v32
+; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v35
+; SI-NEXT:    v_or_b32_e32 v31, v31, v32
+; SI-NEXT:    v_add_i32_e32 v32, vcc, 4, v0
+; SI-NEXT:    buffer_store_dword v31, v32, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v31, 16, v42
+; SI-NEXT:    v_and_b32_e32 v29, 0xffff, v29
+; SI-NEXT:    v_or_b32_e32 v29, v29, v31
+; SI-NEXT:    v_add_i32_e32 v31, vcc, 8, v0
+; SI-NEXT:    buffer_store_dword v29, v31, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v29, 0xffff, v30
+; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v46
+; SI-NEXT:    v_or_b32_e32 v29, v29, v30
+; SI-NEXT:    v_add_i32_e32 v30, vcc, 12, v0
+; SI-NEXT:    buffer_store_dword v29, v30, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v41
+; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v27
+; SI-NEXT:    v_or_b32_e32 v27, v27, v29
+; SI-NEXT:    v_add_i32_e32 v29, vcc, 16, v0
+; SI-NEXT:    buffer_store_dword v27, v29, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v28
 ; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v45
-; SI-NEXT:    v_or_b32_e32 v23, v23, v28
-; SI-NEXT:    v_add_i32_e32 v28, vcc, 24, v0
-; SI-NEXT:    buffer_store_dword v23, v28, s[0:3], 0 offen
+; SI-NEXT:    v_or_b32_e32 v27, v27, v28
+; SI-NEXT:    v_add_i32_e32 v28, vcc, 20, v0
+; SI-NEXT:    buffer_store_dword v27, v28, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v40
+; SI-NEXT:    v_and_b32_e32 v25, 0xffff, v25
+; SI-NEXT:    v_or_b32_e32 v25, v25, v27
+; SI-NEXT:    v_add_i32_e32 v27, vcc, 24, v0
+; SI-NEXT:    buffer_store_dword v25, v27, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v26
-; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v61
-; SI-NEXT:    v_or_b32_e32 v23, v23, v26
+; SI-NEXT:    v_and_b32_e32 v25, 0xffff, v26
+; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v34
+; SI-NEXT:    v_or_b32_e32 v25, v25, v26
 ; SI-NEXT:    v_add_i32_e32 v26, vcc, 28, v0
-; SI-NEXT:    buffer_store_dword v23, v26, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v25, v26, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v25
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v43
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v39
+; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v23
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v25
 ; SI-NEXT:    v_add_i32_e32 v25, vcc, 32, v0
 ; SI-NEXT:    buffer_store_dword v23, v25, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v59
+; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v33
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v24
 ; SI-NEXT:    v_add_i32_e32 v24, vcc, 36, v0
 ; SI-NEXT:    buffer_store_dword v23, v24, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v22, 0xffff, v22
+; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v21
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v41
-; SI-NEXT:    v_or_b32_e32 v22, v22, v23
+; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v52
+; SI-NEXT:    v_or_b32_e32 v21, v21, v23
 ; SI-NEXT:    v_add_i32_e32 v23, vcc, 40, v0
-; SI-NEXT:    buffer_store_dword v22, v23, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; SI-NEXT:    buffer_store_dword v21, v23, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v57
+; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v22
+; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v63
 ; SI-NEXT:    v_or_b32_e32 v21, v21, v22
 ; SI-NEXT:    v_add_i32_e32 v22, vcc, 44, v0
 ; SI-NEXT:    buffer_store_dword v21, v22, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v20
+; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v54
-; SI-NEXT:    v_or_b32_e32 v20, v20, v21
+; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v49
+; SI-NEXT:    v_or_b32_e32 v19, v19, v21
 ; SI-NEXT:    v_add_i32_e32 v21, vcc, 48, v0
-; SI-NEXT:    buffer_store_dword v20, v21, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; SI-NEXT:    buffer_store_dword v19, v21, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v56
+; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v20
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v62
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v20
 ; SI-NEXT:    v_add_i32_e32 v20, vcc, 52, v0
 ; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v52
+; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v48
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v19
 ; SI-NEXT:    v_add_i32_e32 v19, vcc, 56, v0
 ; SI-NEXT:    buffer_store_dword v1, v19, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v46
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v61
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 60, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v50
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v54
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 64, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v44
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v60
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x44, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v48
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v51
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x48, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v42
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v59
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x4c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -71454,7 +72366,7 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v40
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v58
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x54, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -71466,57 +72378,64 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v10
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v55
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v57
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x5c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v34
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v36
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x60, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v53
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v56
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x64, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v13
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v30
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x68, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v14
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v51
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v47
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x6c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v15
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v27
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x70, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v16
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v49
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v55
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x74, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v17
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x78, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v18
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v39
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v53
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
@@ -71539,39 +72458,44 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a,
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB49_4:
+; SI-NEXT:    ; implicit-def: $vgpr51
+; SI-NEXT:    ; implicit-def: $vgpr50
+; SI-NEXT:    ; implicit-def: $vgpr43
+; SI-NEXT:    ; implicit-def: $vgpr35
+; SI-NEXT:    ; implicit-def: $vgpr42
+; SI-NEXT:    ; implicit-def: $vgpr46
+; SI-NEXT:    ; implicit-def: $vgpr41
+; SI-NEXT:    ; implicit-def: $vgpr45
+; SI-NEXT:    ; implicit-def: $vgpr40
+; SI-NEXT:    ; implicit-def: $vgpr34
 ; SI-NEXT:    ; implicit-def: $vgpr39
-; SI-NEXT:    ; implicit-def: $vgpr60
-; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; implicit-def: $vgpr58
+; SI-NEXT:    ; implicit-def: $vgpr33
+; SI-NEXT:    ; implicit-def: $vgpr52
 ; SI-NEXT:    ; implicit-def: $vgpr63
-; SI-NEXT:    ; implicit-def: $vgpr47
+; SI-NEXT:    ; implicit-def: $vgpr49
 ; SI-NEXT:    ; implicit-def: $vgpr62
-; SI-NEXT:    ; implicit-def: $vgpr45
+; SI-NEXT:    ; implicit-def: $vgpr48
 ; SI-NEXT:    ; implicit-def: $vgpr61
-; SI-NEXT:    ; implicit-def: $vgpr43
+; SI-NEXT:    ; implicit-def: $vgpr60
 ; SI-NEXT:    ; implicit-def: $vgpr59
-; SI-NEXT:    ; implicit-def: $vgpr41
+; SI-NEXT:    ; implicit-def: $vgpr58
 ; SI-NEXT:    ; implicit-def: $vgpr57
-; SI-NEXT:    ; implicit-def: $vgpr54
 ; SI-NEXT:    ; implicit-def: $vgpr56
-; SI-NEXT:    ; implicit-def: $vgpr52
-; SI-NEXT:    ; implicit-def: $vgpr46
-; SI-NEXT:    ; implicit-def: $vgpr50
-; SI-NEXT:    ; implicit-def: $vgpr44
-; SI-NEXT:    ; implicit-def: $vgpr48
-; SI-NEXT:    ; implicit-def: $vgpr42
-; SI-NEXT:    ; implicit-def: $vgpr38
-; SI-NEXT:    ; implicit-def: $vgpr40
-; SI-NEXT:    ; implicit-def: $vgpr37
+; SI-NEXT:    ; implicit-def: $vgpr47
 ; SI-NEXT:    ; implicit-def: $vgpr55
-; SI-NEXT:    ; implicit-def: $vgpr34
 ; SI-NEXT:    ; implicit-def: $vgpr53
-; SI-NEXT:    ; implicit-def: $vgpr30
-; SI-NEXT:    ; implicit-def: $vgpr51
-; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $vgpr49
-; SI-NEXT:    ; kill: killed $vgpr39
-; SI-NEXT:    ; implicit-def: $vgpr39
+; SI-NEXT:    ; implicit-def: $vgpr54
+; SI-NEXT:    ; implicit-def: $vgpr38
+; SI-NEXT:    ; implicit-def: $vgpr37
+; SI-NEXT:    ; implicit-def: $vgpr36
+; SI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT:    ; implicit-def: $vgpr50
+; SI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT:    ; implicit-def: $vgpr50
+; SI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT:    s_branch .LBB49_2
 ;
 ; VI-LABEL: bitcast_v32f32_to_v64i16_scalar:
@@ -79027,884 +79951,1220 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3
 ; SI-LABEL: bitcast_v16i64_to_v128i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v41, s30, 0
-; SI-NEXT:    v_writelane_b32 v41, s31, 1
-; SI-NEXT:    v_writelane_b32 v41, s34, 2
-; SI-NEXT:    v_writelane_b32 v41, s35, 3
-; SI-NEXT:    v_writelane_b32 v41, s36, 4
-; SI-NEXT:    v_writelane_b32 v41, s37, 5
-; SI-NEXT:    v_writelane_b32 v41, s38, 6
-; SI-NEXT:    v_writelane_b32 v41, s39, 7
-; SI-NEXT:    v_writelane_b32 v41, s48, 8
-; SI-NEXT:    v_writelane_b32 v41, s49, 9
-; SI-NEXT:    v_writelane_b32 v41, s50, 10
-; SI-NEXT:    v_writelane_b32 v41, s51, 11
-; SI-NEXT:    v_writelane_b32 v41, s52, 12
-; SI-NEXT:    v_writelane_b32 v41, s53, 13
-; SI-NEXT:    v_writelane_b32 v41, s54, 14
-; SI-NEXT:    v_writelane_b32 v41, s55, 15
-; SI-NEXT:    v_writelane_b32 v41, s64, 16
-; SI-NEXT:    v_writelane_b32 v41, s65, 17
-; SI-NEXT:    v_writelane_b32 v41, s66, 18
-; SI-NEXT:    v_writelane_b32 v41, s67, 19
-; SI-NEXT:    v_writelane_b32 v41, s68, 20
-; SI-NEXT:    v_writelane_b32 v41, s69, 21
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    v_writelane_b32 v20, s30, 0
+; SI-NEXT:    v_writelane_b32 v20, s31, 1
+; SI-NEXT:    v_writelane_b32 v20, s34, 2
+; SI-NEXT:    v_writelane_b32 v20, s35, 3
+; SI-NEXT:    v_writelane_b32 v20, s36, 4
+; SI-NEXT:    v_writelane_b32 v20, s37, 5
+; SI-NEXT:    v_writelane_b32 v20, s38, 6
+; SI-NEXT:    v_writelane_b32 v20, s39, 7
+; SI-NEXT:    v_writelane_b32 v20, s48, 8
+; SI-NEXT:    v_writelane_b32 v20, s49, 9
+; SI-NEXT:    v_writelane_b32 v20, s50, 10
+; SI-NEXT:    v_writelane_b32 v20, s51, 11
+; SI-NEXT:    v_writelane_b32 v20, s52, 12
+; SI-NEXT:    v_writelane_b32 v20, s53, 13
+; SI-NEXT:    v_writelane_b32 v20, s54, 14
+; SI-NEXT:    v_writelane_b32 v20, s55, 15
+; SI-NEXT:    v_writelane_b32 v20, s64, 16
+; SI-NEXT:    v_writelane_b32 v20, s65, 17
+; SI-NEXT:    v_writelane_b32 v20, s66, 18
+; SI-NEXT:    v_writelane_b32 v20, s67, 19
+; SI-NEXT:    v_writelane_b32 v20, s68, 20
+; SI-NEXT:    v_writelane_b32 v20, s69, 21
+; SI-NEXT:    v_writelane_b32 v20, s70, 22
+; SI-NEXT:    v_writelane_b32 v20, s71, 23
+; SI-NEXT:    v_writelane_b32 v20, s80, 24
+; SI-NEXT:    v_writelane_b32 v20, s81, 25
+; SI-NEXT:    v_writelane_b32 v20, s82, 26
+; SI-NEXT:    v_writelane_b32 v20, s83, 27
+; SI-NEXT:    v_writelane_b32 v20, s84, 28
+; SI-NEXT:    v_writelane_b32 v20, s85, 29
+; SI-NEXT:    v_writelane_b32 v20, s86, 30
+; SI-NEXT:    v_writelane_b32 v20, s87, 31
+; SI-NEXT:    v_writelane_b32 v20, s96, 32
+; SI-NEXT:    v_writelane_b32 v20, s97, 33
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v19
-; SI-NEXT:    v_writelane_b32 v41, s70, 22
-; SI-NEXT:    v_readfirstlane_b32 s47, v1
-; SI-NEXT:    v_readfirstlane_b32 s46, v2
-; SI-NEXT:    v_readfirstlane_b32 s45, v3
-; SI-NEXT:    v_readfirstlane_b32 s44, v4
-; SI-NEXT:    v_readfirstlane_b32 s43, v5
-; SI-NEXT:    v_readfirstlane_b32 s42, v6
-; SI-NEXT:    v_readfirstlane_b32 s41, v7
-; SI-NEXT:    v_readfirstlane_b32 s40, v8
-; SI-NEXT:    v_readfirstlane_b32 s15, v9
-; SI-NEXT:    v_readfirstlane_b32 s14, v10
-; SI-NEXT:    v_readfirstlane_b32 s13, v11
-; SI-NEXT:    v_readfirstlane_b32 s12, v12
-; SI-NEXT:    v_readfirstlane_b32 s11, v13
-; SI-NEXT:    v_readfirstlane_b32 s10, v14
-; SI-NEXT:    v_readfirstlane_b32 s9, v15
-; SI-NEXT:    v_readfirstlane_b32 s8, v16
-; SI-NEXT:    v_readfirstlane_b32 s7, v17
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_readfirstlane_b32 s6, v18
-; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    v_writelane_b32 v41, s71, 23
+; SI-NEXT:    v_writelane_b32 v20, s98, 34
+; SI-NEXT:    v_readfirstlane_b32 s44, v1
+; SI-NEXT:    v_readfirstlane_b32 s45, v2
+; SI-NEXT:    v_readfirstlane_b32 s42, v3
+; SI-NEXT:    v_readfirstlane_b32 s43, v4
+; SI-NEXT:    v_readfirstlane_b32 s40, v5
+; SI-NEXT:    v_readfirstlane_b32 s41, v6
+; SI-NEXT:    v_readfirstlane_b32 s14, v7
+; SI-NEXT:    v_readfirstlane_b32 s15, v8
+; SI-NEXT:    v_readfirstlane_b32 s12, v9
+; SI-NEXT:    v_readfirstlane_b32 s13, v10
+; SI-NEXT:    v_readfirstlane_b32 s10, v11
+; SI-NEXT:    v_readfirstlane_b32 s11, v12
+; SI-NEXT:    v_readfirstlane_b32 s8, v13
+; SI-NEXT:    v_readfirstlane_b32 s9, v14
+; SI-NEXT:    v_readfirstlane_b32 s6, v15
+; SI-NEXT:    v_readfirstlane_b32 s7, v16
+; SI-NEXT:    v_readfirstlane_b32 s4, v17
+; SI-NEXT:    s_and_b64 s[46:47], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s5, v18
+; SI-NEXT:    v_writelane_b32 v20, s99, 35
+; SI-NEXT:    ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
+; SI-NEXT:    ; implicit-def: $vgpr21 : SGPR spill to VGPR lane
 ; SI-NEXT:    s_cbranch_scc0 .LBB57_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v22, s45
-; SI-NEXT:    v_mov_b32_e32 v3, s7
-; SI-NEXT:    v_mov_b32_e32 v6, s9
-; SI-NEXT:    v_mov_b32_e32 v9, s11
-; SI-NEXT:    v_mov_b32_e32 v12, s13
-; SI-NEXT:    v_mov_b32_e32 v15, s15
-; SI-NEXT:    v_mov_b32_e32 v18, s41
-; SI-NEXT:    v_mov_b32_e32 v21, s43
-; SI-NEXT:    v_alignbit_b32 v24, s44, v22, 24
-; SI-NEXT:    v_alignbit_b32 v25, s44, v22, 16
-; SI-NEXT:    v_alignbit_b32 v26, s44, v22, 8
-; SI-NEXT:    v_mov_b32_e32 v22, s47
-; SI-NEXT:    v_mov_b32_e32 v23, s28
-; SI-NEXT:    v_mov_b32_e32 v29, s26
-; SI-NEXT:    v_mov_b32_e32 v35, s24
-; SI-NEXT:    v_mov_b32_e32 v39, s22
-; SI-NEXT:    v_mov_b32_e32 v50, s20
-; SI-NEXT:    v_mov_b32_e32 v53, s18
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v40, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v3, 24
-; SI-NEXT:    v_alignbit_b32 v2, s6, v3, 16
-; SI-NEXT:    v_alignbit_b32 v3, s6, v3, 8
-; SI-NEXT:    v_alignbit_b32 v4, s8, v6, 24
-; SI-NEXT:    v_alignbit_b32 v5, s8, v6, 16
-; SI-NEXT:    v_alignbit_b32 v6, s8, v6, 8
-; SI-NEXT:    v_alignbit_b32 v7, s10, v9, 24
-; SI-NEXT:    v_alignbit_b32 v8, s10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v9, s10, v9, 8
-; SI-NEXT:    v_alignbit_b32 v10, s12, v12, 24
-; SI-NEXT:    v_alignbit_b32 v11, s12, v12, 16
-; SI-NEXT:    v_alignbit_b32 v12, s12, v12, 8
-; SI-NEXT:    v_alignbit_b32 v13, s14, v15, 24
-; SI-NEXT:    v_alignbit_b32 v14, s14, v15, 16
-; SI-NEXT:    v_alignbit_b32 v15, s14, v15, 8
-; SI-NEXT:    v_alignbit_b32 v16, s40, v18, 24
-; SI-NEXT:    v_alignbit_b32 v17, s40, v18, 16
-; SI-NEXT:    v_alignbit_b32 v18, s40, v18, 8
-; SI-NEXT:    v_alignbit_b32 v19, s42, v21, 24
-; SI-NEXT:    v_alignbit_b32 v20, s42, v21, 16
-; SI-NEXT:    v_alignbit_b32 v21, s42, v21, 8
-; SI-NEXT:    v_alignbit_b32 v30, s46, v22, 24
-; SI-NEXT:    v_alignbit_b32 v31, s46, v22, 16
-; SI-NEXT:    v_alignbit_b32 v32, s46, v22, 8
-; SI-NEXT:    v_alignbit_b32 v36, s29, v23, 24
-; SI-NEXT:    v_alignbit_b32 v22, s29, v23, 16
-; SI-NEXT:    v_alignbit_b32 v23, s29, v23, 8
-; SI-NEXT:    v_alignbit_b32 v27, s27, v29, 24
-; SI-NEXT:    v_alignbit_b32 v28, s27, v29, 16
-; SI-NEXT:    v_alignbit_b32 v29, s27, v29, 8
-; SI-NEXT:    v_alignbit_b32 v33, s25, v35, 24
-; SI-NEXT:    v_alignbit_b32 v34, s25, v35, 16
-; SI-NEXT:    v_alignbit_b32 v35, s25, v35, 8
-; SI-NEXT:    v_alignbit_b32 v37, s23, v39, 24
-; SI-NEXT:    v_alignbit_b32 v38, s23, v39, 16
-; SI-NEXT:    v_alignbit_b32 v39, s23, v39, 8
-; SI-NEXT:    v_alignbit_b32 v48, s21, v50, 24
-; SI-NEXT:    v_alignbit_b32 v49, s21, v50, 16
-; SI-NEXT:    v_alignbit_b32 v50, s21, v50, 8
-; SI-NEXT:    v_alignbit_b32 v51, s19, v53, 24
-; SI-NEXT:    v_alignbit_b32 v52, s19, v53, 16
-; SI-NEXT:    v_alignbit_b32 v53, s19, v53, 8
-; SI-NEXT:    v_alignbit_b32 v54, s17, v40, 24
-; SI-NEXT:    v_alignbit_b32 v55, s17, v40, 16
-; SI-NEXT:    v_alignbit_b32 v40, s17, v40, 8
-; SI-NEXT:    s_lshr_b32 s56, s6, 24
-; SI-NEXT:    s_lshr_b32 s57, s6, 16
-; SI-NEXT:    s_lshr_b32 s58, s6, 8
-; SI-NEXT:    s_lshr_b32 s59, s8, 24
-; SI-NEXT:    s_lshr_b32 s60, s8, 16
-; SI-NEXT:    s_lshr_b32 s61, s8, 8
-; SI-NEXT:    s_lshr_b32 s62, s10, 24
-; SI-NEXT:    s_lshr_b32 s63, s10, 16
-; SI-NEXT:    s_lshr_b32 s72, s10, 8
-; SI-NEXT:    s_lshr_b32 s73, s12, 24
-; SI-NEXT:    s_lshr_b32 s74, s12, 16
-; SI-NEXT:    s_lshr_b32 s75, s12, 8
-; SI-NEXT:    s_lshr_b32 s76, s14, 24
-; SI-NEXT:    s_lshr_b32 s77, s14, 16
-; SI-NEXT:    s_lshr_b32 s78, s14, 8
-; SI-NEXT:    s_lshr_b32 s79, s40, 24
-; SI-NEXT:    s_lshr_b32 s88, s40, 16
-; SI-NEXT:    s_lshr_b32 s89, s40, 8
-; SI-NEXT:    s_lshr_b32 s90, s42, 24
-; SI-NEXT:    s_lshr_b32 s91, s42, 16
-; SI-NEXT:    s_lshr_b32 s92, s42, 8
-; SI-NEXT:    s_lshr_b32 s93, s44, 24
-; SI-NEXT:    s_lshr_b32 s94, s44, 16
-; SI-NEXT:    s_lshr_b32 s95, s44, 8
-; SI-NEXT:    s_lshr_b32 s30, s46, 24
-; SI-NEXT:    s_lshr_b32 s31, s46, 16
-; SI-NEXT:    s_lshr_b32 s34, s46, 8
-; SI-NEXT:    s_lshr_b32 s35, s29, 24
-; SI-NEXT:    s_lshr_b32 s36, s29, 16
-; SI-NEXT:    s_lshr_b32 s37, s29, 8
-; SI-NEXT:    s_lshr_b32 s38, s27, 24
-; SI-NEXT:    s_lshr_b32 s39, s27, 16
-; SI-NEXT:    s_lshr_b32 s48, s27, 8
-; SI-NEXT:    s_lshr_b32 s49, s25, 24
-; SI-NEXT:    s_lshr_b32 s50, s25, 16
-; SI-NEXT:    s_lshr_b32 s51, s25, 8
-; SI-NEXT:    s_lshr_b32 s52, s23, 24
-; SI-NEXT:    s_lshr_b32 s53, s23, 16
-; SI-NEXT:    s_lshr_b32 s54, s23, 8
-; SI-NEXT:    s_lshr_b32 s55, s21, 24
-; SI-NEXT:    s_lshr_b32 s64, s21, 16
-; SI-NEXT:    s_lshr_b32 s65, s21, 8
-; SI-NEXT:    s_lshr_b32 s66, s19, 24
-; SI-NEXT:    s_lshr_b32 s67, s19, 16
-; SI-NEXT:    s_lshr_b32 s68, s19, 8
-; SI-NEXT:    s_lshr_b32 s69, s17, 24
-; SI-NEXT:    s_lshr_b32 s70, s17, 16
-; SI-NEXT:    s_lshr_b32 s71, s17, 8
+; SI-NEXT:    s_lshr_b32 s46, s5, 24
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_writelane_b32 v22, s46, 40
+; SI-NEXT:    s_lshr_b32 s46, s5, 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 41
+; SI-NEXT:    s_lshr_b32 s46, s5, 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 42
+; SI-NEXT:    s_lshr_b32 s46, s7, 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 43
+; SI-NEXT:    s_lshr_b32 s46, s7, 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 44
+; SI-NEXT:    s_lshr_b32 s46, s7, 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 45
+; SI-NEXT:    s_lshr_b32 s46, s9, 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 46
+; SI-NEXT:    s_lshr_b32 s46, s9, 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 47
+; SI-NEXT:    s_lshr_b32 s46, s9, 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 48
+; SI-NEXT:    s_lshr_b32 s46, s11, 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 49
+; SI-NEXT:    s_lshr_b32 s46, s11, 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 50
+; SI-NEXT:    s_lshr_b32 s46, s11, 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 51
+; SI-NEXT:    s_lshr_b32 s46, s13, 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 52
+; SI-NEXT:    s_lshr_b32 s46, s13, 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 53
+; SI-NEXT:    s_lshr_b32 s46, s13, 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 54
+; SI-NEXT:    s_lshr_b32 s46, s15, 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 55
+; SI-NEXT:    s_lshr_b32 s46, s15, 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 56
+; SI-NEXT:    s_lshr_b32 s46, s15, 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 57
+; SI-NEXT:    s_lshr_b32 s46, s41, 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 58
+; SI-NEXT:    s_lshr_b32 s46, s41, 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 59
+; SI-NEXT:    s_lshr_b32 s46, s41, 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 60
+; SI-NEXT:    s_lshr_b32 s46, s43, 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 61
+; SI-NEXT:    s_lshr_b32 s46, s43, 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 62
+; SI-NEXT:    s_lshr_b32 s46, s43, 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 63
+; SI-NEXT:    s_lshr_b32 s46, s45, 24
+; SI-NEXT:    v_writelane_b32 v21, s46, 0
+; SI-NEXT:    s_lshr_b32 s46, s45, 16
+; SI-NEXT:    v_writelane_b32 v21, s46, 1
+; SI-NEXT:    s_lshr_b32 s46, s45, 8
+; SI-NEXT:    v_writelane_b32 v21, s46, 2
+; SI-NEXT:    s_lshr_b32 s46, s29, 24
+; SI-NEXT:    v_writelane_b32 v21, s46, 3
+; SI-NEXT:    s_lshr_b32 s46, s29, 16
+; SI-NEXT:    v_writelane_b32 v21, s46, 4
+; SI-NEXT:    s_lshr_b32 s46, s29, 8
+; SI-NEXT:    v_writelane_b32 v21, s46, 5
+; SI-NEXT:    s_lshr_b32 s46, s27, 24
+; SI-NEXT:    v_writelane_b32 v21, s46, 6
+; SI-NEXT:    s_lshr_b32 s46, s27, 16
+; SI-NEXT:    v_writelane_b32 v21, s46, 7
+; SI-NEXT:    s_lshr_b32 s46, s27, 8
+; SI-NEXT:    v_writelane_b32 v21, s46, 8
+; SI-NEXT:    s_lshr_b32 s46, s25, 24
+; SI-NEXT:    v_writelane_b32 v21, s46, 9
+; SI-NEXT:    s_lshr_b32 s46, s25, 16
+; SI-NEXT:    v_writelane_b32 v21, s46, 10
+; SI-NEXT:    s_lshr_b32 s46, s25, 8
+; SI-NEXT:    v_writelane_b32 v21, s46, 11
+; SI-NEXT:    s_lshr_b32 s46, s23, 24
+; SI-NEXT:    v_writelane_b32 v21, s46, 12
+; SI-NEXT:    s_lshr_b32 s46, s23, 16
+; SI-NEXT:    v_writelane_b32 v21, s46, 13
+; SI-NEXT:    s_lshr_b32 s46, s23, 8
+; SI-NEXT:    v_writelane_b32 v21, s46, 14
+; SI-NEXT:    s_lshr_b32 s46, s21, 24
+; SI-NEXT:    v_writelane_b32 v21, s46, 15
+; SI-NEXT:    s_lshr_b32 s46, s21, 16
+; SI-NEXT:    v_writelane_b32 v21, s46, 16
+; SI-NEXT:    s_lshr_b32 s46, s21, 8
+; SI-NEXT:    v_writelane_b32 v21, s46, 17
+; SI-NEXT:    s_lshr_b64 s[46:47], s[4:5], 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 38
+; SI-NEXT:    v_writelane_b32 v22, s47, 39
+; SI-NEXT:    s_lshr_b64 s[46:47], s[4:5], 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 36
+; SI-NEXT:    v_writelane_b32 v22, s47, 37
+; SI-NEXT:    s_lshr_b64 s[46:47], s[6:7], 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 34
+; SI-NEXT:    v_writelane_b32 v22, s47, 35
+; SI-NEXT:    s_lshr_b64 s[46:47], s[6:7], 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 32
+; SI-NEXT:    v_writelane_b32 v22, s47, 33
+; SI-NEXT:    s_lshr_b64 s[46:47], s[6:7], 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 30
+; SI-NEXT:    v_writelane_b32 v22, s47, 31
+; SI-NEXT:    s_lshr_b64 s[46:47], s[8:9], 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 28
+; SI-NEXT:    v_writelane_b32 v22, s47, 29
+; SI-NEXT:    s_lshr_b64 s[46:47], s[8:9], 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 26
+; SI-NEXT:    v_writelane_b32 v22, s47, 27
+; SI-NEXT:    s_lshr_b64 s[46:47], s[8:9], 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 24
+; SI-NEXT:    v_writelane_b32 v22, s47, 25
+; SI-NEXT:    s_lshr_b64 s[46:47], s[10:11], 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 22
+; SI-NEXT:    v_writelane_b32 v22, s47, 23
+; SI-NEXT:    s_lshr_b64 s[46:47], s[10:11], 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 20
+; SI-NEXT:    v_writelane_b32 v22, s47, 21
+; SI-NEXT:    s_lshr_b64 s[46:47], s[10:11], 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 18
+; SI-NEXT:    v_writelane_b32 v22, s47, 19
+; SI-NEXT:    s_lshr_b64 s[46:47], s[12:13], 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 16
+; SI-NEXT:    v_writelane_b32 v22, s47, 17
+; SI-NEXT:    s_lshr_b64 s[46:47], s[12:13], 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 14
+; SI-NEXT:    v_writelane_b32 v22, s47, 15
+; SI-NEXT:    s_lshr_b64 s[46:47], s[12:13], 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 12
+; SI-NEXT:    v_writelane_b32 v22, s47, 13
+; SI-NEXT:    s_lshr_b64 s[46:47], s[14:15], 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 10
+; SI-NEXT:    v_writelane_b32 v22, s47, 11
+; SI-NEXT:    s_lshr_b64 s[46:47], s[14:15], 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 8
+; SI-NEXT:    v_writelane_b32 v22, s47, 9
+; SI-NEXT:    s_lshr_b64 s[46:47], s[14:15], 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 6
+; SI-NEXT:    v_writelane_b32 v22, s47, 7
+; SI-NEXT:    s_lshr_b64 s[46:47], s[40:41], 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 4
+; SI-NEXT:    v_writelane_b32 v22, s47, 5
+; SI-NEXT:    s_lshr_b64 s[46:47], s[40:41], 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 2
+; SI-NEXT:    v_writelane_b32 v22, s47, 3
+; SI-NEXT:    s_lshr_b64 s[46:47], s[40:41], 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 0
+; SI-NEXT:    s_lshr_b32 s49, s19, 24
+; SI-NEXT:    s_lshr_b32 s48, s19, 16
+; SI-NEXT:    s_lshr_b32 s50, s19, 8
+; SI-NEXT:    s_lshr_b32 s51, s17, 24
+; SI-NEXT:    s_lshr_b32 s52, s17, 16
+; SI-NEXT:    s_lshr_b32 s53, s17, 8
+; SI-NEXT:    s_lshr_b64 s[54:55], s[4:5], 16
+; SI-NEXT:    v_writelane_b32 v22, s47, 1
+; SI-NEXT:    s_lshr_b64 s[64:65], s[42:43], 24
+; SI-NEXT:    s_lshr_b64 s[66:67], s[42:43], 16
+; SI-NEXT:    s_lshr_b64 s[68:69], s[42:43], 8
+; SI-NEXT:    s_lshr_b64 s[70:71], s[44:45], 24
+; SI-NEXT:    s_lshr_b64 s[80:81], s[44:45], 16
+; SI-NEXT:    s_lshr_b64 s[82:83], s[44:45], 8
+; SI-NEXT:    s_lshr_b64 s[84:85], s[28:29], 24
+; SI-NEXT:    s_lshr_b64 s[86:87], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[96:97], s[28:29], 8
+; SI-NEXT:    s_lshr_b64 s[98:99], s[26:27], 24
+; SI-NEXT:    s_lshr_b64 s[46:47], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[26:27], 8
+; SI-NEXT:    s_lshr_b64 s[58:59], s[24:25], 24
+; SI-NEXT:    s_lshr_b64 s[60:61], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[62:63], s[24:25], 8
+; SI-NEXT:    s_lshr_b64 s[72:73], s[22:23], 24
+; SI-NEXT:    s_lshr_b64 s[74:75], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[76:77], s[22:23], 8
+; SI-NEXT:    s_lshr_b64 s[78:79], s[20:21], 24
+; SI-NEXT:    s_lshr_b64 s[88:89], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[90:91], s[20:21], 8
+; SI-NEXT:    s_lshr_b64 s[92:93], s[18:19], 24
+; SI-NEXT:    s_lshr_b64 s[94:95], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[30:31], s[18:19], 8
+; SI-NEXT:    s_lshr_b64 s[34:35], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[36:37], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[38:39], s[16:17], 8
 ; SI-NEXT:    s_cbranch_execnz .LBB57_3
 ; SI-NEXT:  .LBB57_2: ; %cmp.true
-; SI-NEXT:    s_add_u32 s16, s16, 3
-; SI-NEXT:    s_addc_u32 s17, s17, 0
-; SI-NEXT:    s_add_u32 s18, s18, 3
-; SI-NEXT:    s_addc_u32 s19, s19, 0
-; SI-NEXT:    s_add_u32 s20, s20, 3
-; SI-NEXT:    s_addc_u32 s21, s21, 0
-; SI-NEXT:    s_add_u32 s22, s22, 3
-; SI-NEXT:    s_addc_u32 s23, s23, 0
-; SI-NEXT:    s_add_u32 s24, s24, 3
-; SI-NEXT:    s_addc_u32 s25, s25, 0
-; SI-NEXT:    s_add_u32 s26, s26, 3
-; SI-NEXT:    s_addc_u32 s27, s27, 0
+; SI-NEXT:    s_add_u32 s4, s4, 3
+; SI-NEXT:    s_addc_u32 s5, s5, 0
+; SI-NEXT:    s_add_u32 s6, s6, 3
+; SI-NEXT:    s_addc_u32 s7, s7, 0
+; SI-NEXT:    s_add_u32 s8, s8, 3
+; SI-NEXT:    s_addc_u32 s9, s9, 0
+; SI-NEXT:    s_add_u32 s10, s10, 3
+; SI-NEXT:    s_addc_u32 s11, s11, 0
+; SI-NEXT:    s_add_u32 s12, s12, 3
+; SI-NEXT:    s_addc_u32 s13, s13, 0
+; SI-NEXT:    s_add_u32 s14, s14, 3
+; SI-NEXT:    s_addc_u32 s15, s15, 0
+; SI-NEXT:    s_add_u32 s40, s40, 3
+; SI-NEXT:    s_addc_u32 s41, s41, 0
+; SI-NEXT:    s_add_u32 s42, s42, 3
+; SI-NEXT:    s_addc_u32 s43, s43, 0
+; SI-NEXT:    s_add_u32 s44, s44, 3
+; SI-NEXT:    s_addc_u32 s45, s45, 0
 ; SI-NEXT:    s_add_u32 s28, s28, 3
 ; SI-NEXT:    s_addc_u32 s29, s29, 0
-; SI-NEXT:    s_add_u32 s47, s47, 3
-; SI-NEXT:    s_addc_u32 s46, s46, 0
-; SI-NEXT:    s_add_u32 s45, s45, 3
-; SI-NEXT:    s_addc_u32 s44, s44, 0
-; SI-NEXT:    s_add_u32 s43, s43, 3
-; SI-NEXT:    s_addc_u32 s42, s42, 0
-; SI-NEXT:    s_add_u32 s41, s41, 3
-; SI-NEXT:    s_addc_u32 s40, s40, 0
-; SI-NEXT:    s_add_u32 s15, s15, 3
-; SI-NEXT:    s_addc_u32 s14, s14, 0
-; SI-NEXT:    s_add_u32 s13, s13, 3
-; SI-NEXT:    s_addc_u32 s12, s12, 0
-; SI-NEXT:    s_add_u32 s11, s11, 3
-; SI-NEXT:    s_addc_u32 s10, s10, 0
-; SI-NEXT:    s_add_u32 s9, s9, 3
-; SI-NEXT:    s_addc_u32 s8, s8, 0
-; SI-NEXT:    s_add_u32 s7, s7, 3
-; SI-NEXT:    v_mov_b32_e32 v22, s45
-; SI-NEXT:    s_addc_u32 s6, s6, 0
-; SI-NEXT:    v_mov_b32_e32 v3, s7
-; SI-NEXT:    v_mov_b32_e32 v6, s9
-; SI-NEXT:    v_mov_b32_e32 v9, s11
-; SI-NEXT:    v_mov_b32_e32 v12, s13
-; SI-NEXT:    v_mov_b32_e32 v15, s15
-; SI-NEXT:    v_mov_b32_e32 v18, s41
-; SI-NEXT:    v_mov_b32_e32 v21, s43
-; SI-NEXT:    v_alignbit_b32 v24, s44, v22, 24
-; SI-NEXT:    v_alignbit_b32 v25, s44, v22, 16
-; SI-NEXT:    v_alignbit_b32 v26, s44, v22, 8
-; SI-NEXT:    v_mov_b32_e32 v22, s47
-; SI-NEXT:    v_mov_b32_e32 v23, s28
-; SI-NEXT:    v_mov_b32_e32 v29, s26
-; SI-NEXT:    v_mov_b32_e32 v35, s24
-; SI-NEXT:    v_mov_b32_e32 v39, s22
-; SI-NEXT:    v_mov_b32_e32 v50, s20
-; SI-NEXT:    v_mov_b32_e32 v53, s18
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v40, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v3, 24
-; SI-NEXT:    v_alignbit_b32 v2, s6, v3, 16
-; SI-NEXT:    v_alignbit_b32 v3, s6, v3, 8
-; SI-NEXT:    v_alignbit_b32 v4, s8, v6, 24
-; SI-NEXT:    v_alignbit_b32 v5, s8, v6, 16
-; SI-NEXT:    v_alignbit_b32 v6, s8, v6, 8
-; SI-NEXT:    v_alignbit_b32 v7, s10, v9, 24
-; SI-NEXT:    v_alignbit_b32 v8, s10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v9, s10, v9, 8
-; SI-NEXT:    v_alignbit_b32 v10, s12, v12, 24
-; SI-NEXT:    v_alignbit_b32 v11, s12, v12, 16
-; SI-NEXT:    v_alignbit_b32 v12, s12, v12, 8
-; SI-NEXT:    v_alignbit_b32 v13, s14, v15, 24
-; SI-NEXT:    v_alignbit_b32 v14, s14, v15, 16
-; SI-NEXT:    v_alignbit_b32 v15, s14, v15, 8
-; SI-NEXT:    v_alignbit_b32 v16, s40, v18, 24
-; SI-NEXT:    v_alignbit_b32 v17, s40, v18, 16
-; SI-NEXT:    v_alignbit_b32 v18, s40, v18, 8
-; SI-NEXT:    v_alignbit_b32 v19, s42, v21, 24
-; SI-NEXT:    v_alignbit_b32 v20, s42, v21, 16
-; SI-NEXT:    v_alignbit_b32 v21, s42, v21, 8
-; SI-NEXT:    v_alignbit_b32 v30, s46, v22, 24
-; SI-NEXT:    v_alignbit_b32 v31, s46, v22, 16
-; SI-NEXT:    v_alignbit_b32 v32, s46, v22, 8
-; SI-NEXT:    v_alignbit_b32 v36, s29, v23, 24
-; SI-NEXT:    v_alignbit_b32 v22, s29, v23, 16
-; SI-NEXT:    v_alignbit_b32 v23, s29, v23, 8
-; SI-NEXT:    v_alignbit_b32 v27, s27, v29, 24
-; SI-NEXT:    v_alignbit_b32 v28, s27, v29, 16
-; SI-NEXT:    v_alignbit_b32 v29, s27, v29, 8
-; SI-NEXT:    v_alignbit_b32 v33, s25, v35, 24
-; SI-NEXT:    v_alignbit_b32 v34, s25, v35, 16
-; SI-NEXT:    v_alignbit_b32 v35, s25, v35, 8
-; SI-NEXT:    v_alignbit_b32 v37, s23, v39, 24
-; SI-NEXT:    v_alignbit_b32 v38, s23, v39, 16
-; SI-NEXT:    v_alignbit_b32 v39, s23, v39, 8
-; SI-NEXT:    v_alignbit_b32 v48, s21, v50, 24
-; SI-NEXT:    v_alignbit_b32 v49, s21, v50, 16
-; SI-NEXT:    v_alignbit_b32 v50, s21, v50, 8
-; SI-NEXT:    v_alignbit_b32 v51, s19, v53, 24
-; SI-NEXT:    v_alignbit_b32 v52, s19, v53, 16
-; SI-NEXT:    v_alignbit_b32 v53, s19, v53, 8
-; SI-NEXT:    v_alignbit_b32 v54, s17, v40, 24
-; SI-NEXT:    v_alignbit_b32 v55, s17, v40, 16
-; SI-NEXT:    v_alignbit_b32 v40, s17, v40, 8
-; SI-NEXT:    s_lshr_b32 s56, s6, 24
-; SI-NEXT:    s_lshr_b32 s57, s6, 16
-; SI-NEXT:    s_lshr_b32 s58, s6, 8
-; SI-NEXT:    s_lshr_b32 s59, s8, 24
-; SI-NEXT:    s_lshr_b32 s60, s8, 16
-; SI-NEXT:    s_lshr_b32 s61, s8, 8
-; SI-NEXT:    s_lshr_b32 s62, s10, 24
-; SI-NEXT:    s_lshr_b32 s63, s10, 16
-; SI-NEXT:    s_lshr_b32 s72, s10, 8
-; SI-NEXT:    s_lshr_b32 s73, s12, 24
-; SI-NEXT:    s_lshr_b32 s74, s12, 16
-; SI-NEXT:    s_lshr_b32 s75, s12, 8
-; SI-NEXT:    s_lshr_b32 s76, s14, 24
-; SI-NEXT:    s_lshr_b32 s77, s14, 16
-; SI-NEXT:    s_lshr_b32 s78, s14, 8
-; SI-NEXT:    s_lshr_b32 s79, s40, 24
-; SI-NEXT:    s_lshr_b32 s88, s40, 16
-; SI-NEXT:    s_lshr_b32 s89, s40, 8
-; SI-NEXT:    s_lshr_b32 s90, s42, 24
-; SI-NEXT:    s_lshr_b32 s91, s42, 16
-; SI-NEXT:    s_lshr_b32 s92, s42, 8
-; SI-NEXT:    s_lshr_b32 s93, s44, 24
-; SI-NEXT:    s_lshr_b32 s94, s44, 16
-; SI-NEXT:    s_lshr_b32 s95, s44, 8
-; SI-NEXT:    s_lshr_b32 s30, s46, 24
-; SI-NEXT:    s_lshr_b32 s31, s46, 16
-; SI-NEXT:    s_lshr_b32 s34, s46, 8
-; SI-NEXT:    s_lshr_b32 s35, s29, 24
-; SI-NEXT:    s_lshr_b32 s36, s29, 16
-; SI-NEXT:    s_lshr_b32 s37, s29, 8
-; SI-NEXT:    s_lshr_b32 s38, s27, 24
-; SI-NEXT:    s_lshr_b32 s39, s27, 16
-; SI-NEXT:    s_lshr_b32 s48, s27, 8
-; SI-NEXT:    s_lshr_b32 s49, s25, 24
-; SI-NEXT:    s_lshr_b32 s50, s25, 16
-; SI-NEXT:    s_lshr_b32 s51, s25, 8
-; SI-NEXT:    s_lshr_b32 s52, s23, 24
-; SI-NEXT:    s_lshr_b32 s53, s23, 16
-; SI-NEXT:    s_lshr_b32 s54, s23, 8
-; SI-NEXT:    s_lshr_b32 s55, s21, 24
-; SI-NEXT:    s_lshr_b32 s64, s21, 16
-; SI-NEXT:    s_lshr_b32 s65, s21, 8
-; SI-NEXT:    s_lshr_b32 s66, s19, 24
-; SI-NEXT:    s_lshr_b32 s67, s19, 16
-; SI-NEXT:    s_lshr_b32 s68, s19, 8
-; SI-NEXT:    s_lshr_b32 s69, s17, 24
-; SI-NEXT:    s_lshr_b32 s70, s17, 16
-; SI-NEXT:    s_lshr_b32 s71, s17, 8
+; SI-NEXT:    s_add_u32 s26, s26, 3
+; SI-NEXT:    s_addc_u32 s27, s27, 0
+; SI-NEXT:    s_add_u32 s24, s24, 3
+; SI-NEXT:    s_addc_u32 s25, s25, 0
+; SI-NEXT:    s_add_u32 s22, s22, 3
+; SI-NEXT:    s_addc_u32 s23, s23, 0
+; SI-NEXT:    s_add_u32 s20, s20, 3
+; SI-NEXT:    s_addc_u32 s21, s21, 0
+; SI-NEXT:    s_add_u32 s18, s18, 3
+; SI-NEXT:    s_addc_u32 s19, s19, 0
+; SI-NEXT:    s_add_u32 s16, s16, 3
+; SI-NEXT:    s_addc_u32 s17, s17, 0
+; SI-NEXT:    s_lshr_b32 s46, s5, 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 40
+; SI-NEXT:    s_lshr_b32 s46, s5, 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 41
+; SI-NEXT:    s_lshr_b32 s46, s5, 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 42
+; SI-NEXT:    s_lshr_b32 s46, s7, 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 43
+; SI-NEXT:    s_lshr_b32 s46, s7, 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 44
+; SI-NEXT:    s_lshr_b32 s46, s7, 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 45
+; SI-NEXT:    s_lshr_b32 s46, s9, 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 46
+; SI-NEXT:    s_lshr_b32 s46, s9, 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 47
+; SI-NEXT:    s_lshr_b32 s46, s9, 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 48
+; SI-NEXT:    s_lshr_b32 s46, s11, 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 49
+; SI-NEXT:    s_lshr_b32 s46, s11, 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 50
+; SI-NEXT:    s_lshr_b32 s46, s11, 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 51
+; SI-NEXT:    s_lshr_b32 s46, s13, 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 52
+; SI-NEXT:    s_lshr_b32 s46, s13, 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 53
+; SI-NEXT:    s_lshr_b32 s46, s13, 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 54
+; SI-NEXT:    s_lshr_b32 s46, s15, 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 55
+; SI-NEXT:    s_lshr_b32 s46, s15, 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 56
+; SI-NEXT:    s_lshr_b32 s46, s15, 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 57
+; SI-NEXT:    s_lshr_b32 s46, s41, 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 58
+; SI-NEXT:    s_lshr_b32 s46, s41, 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 59
+; SI-NEXT:    s_lshr_b32 s46, s41, 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 60
+; SI-NEXT:    s_lshr_b32 s46, s43, 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 61
+; SI-NEXT:    s_lshr_b32 s46, s43, 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 62
+; SI-NEXT:    s_lshr_b32 s46, s43, 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 63
+; SI-NEXT:    s_lshr_b32 s46, s45, 24
+; SI-NEXT:    v_writelane_b32 v21, s46, 0
+; SI-NEXT:    s_lshr_b32 s46, s45, 16
+; SI-NEXT:    v_writelane_b32 v21, s46, 1
+; SI-NEXT:    s_lshr_b32 s46, s45, 8
+; SI-NEXT:    v_writelane_b32 v21, s46, 2
+; SI-NEXT:    s_lshr_b32 s46, s29, 24
+; SI-NEXT:    v_writelane_b32 v21, s46, 3
+; SI-NEXT:    s_lshr_b32 s46, s29, 16
+; SI-NEXT:    v_writelane_b32 v21, s46, 4
+; SI-NEXT:    s_lshr_b32 s46, s29, 8
+; SI-NEXT:    v_writelane_b32 v21, s46, 5
+; SI-NEXT:    s_lshr_b32 s46, s27, 24
+; SI-NEXT:    v_writelane_b32 v21, s46, 6
+; SI-NEXT:    s_lshr_b32 s46, s27, 16
+; SI-NEXT:    v_writelane_b32 v21, s46, 7
+; SI-NEXT:    s_lshr_b32 s46, s27, 8
+; SI-NEXT:    v_writelane_b32 v21, s46, 8
+; SI-NEXT:    s_lshr_b32 s46, s25, 24
+; SI-NEXT:    v_writelane_b32 v21, s46, 9
+; SI-NEXT:    s_lshr_b32 s46, s25, 16
+; SI-NEXT:    v_writelane_b32 v21, s46, 10
+; SI-NEXT:    s_lshr_b32 s46, s25, 8
+; SI-NEXT:    v_writelane_b32 v21, s46, 11
+; SI-NEXT:    s_lshr_b32 s46, s23, 24
+; SI-NEXT:    v_writelane_b32 v21, s46, 12
+; SI-NEXT:    s_lshr_b32 s46, s23, 16
+; SI-NEXT:    v_writelane_b32 v21, s46, 13
+; SI-NEXT:    s_lshr_b32 s46, s23, 8
+; SI-NEXT:    v_writelane_b32 v21, s46, 14
+; SI-NEXT:    s_lshr_b32 s46, s21, 24
+; SI-NEXT:    v_writelane_b32 v21, s46, 15
+; SI-NEXT:    s_lshr_b32 s46, s21, 16
+; SI-NEXT:    v_writelane_b32 v21, s46, 16
+; SI-NEXT:    s_lshr_b32 s46, s21, 8
+; SI-NEXT:    v_writelane_b32 v21, s46, 17
+; SI-NEXT:    s_lshr_b64 s[46:47], s[4:5], 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 38
+; SI-NEXT:    v_writelane_b32 v22, s47, 39
+; SI-NEXT:    s_lshr_b64 s[46:47], s[4:5], 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 36
+; SI-NEXT:    v_writelane_b32 v22, s47, 37
+; SI-NEXT:    s_lshr_b64 s[46:47], s[6:7], 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 34
+; SI-NEXT:    v_writelane_b32 v22, s47, 35
+; SI-NEXT:    s_lshr_b64 s[46:47], s[6:7], 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 32
+; SI-NEXT:    v_writelane_b32 v22, s47, 33
+; SI-NEXT:    s_lshr_b64 s[46:47], s[6:7], 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 30
+; SI-NEXT:    v_writelane_b32 v22, s47, 31
+; SI-NEXT:    s_lshr_b64 s[46:47], s[8:9], 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 28
+; SI-NEXT:    v_writelane_b32 v22, s47, 29
+; SI-NEXT:    s_lshr_b64 s[46:47], s[8:9], 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 26
+; SI-NEXT:    v_writelane_b32 v22, s47, 27
+; SI-NEXT:    s_lshr_b64 s[46:47], s[8:9], 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 24
+; SI-NEXT:    v_writelane_b32 v22, s47, 25
+; SI-NEXT:    s_lshr_b64 s[46:47], s[10:11], 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 22
+; SI-NEXT:    v_writelane_b32 v22, s47, 23
+; SI-NEXT:    s_lshr_b64 s[46:47], s[10:11], 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 20
+; SI-NEXT:    v_writelane_b32 v22, s47, 21
+; SI-NEXT:    s_lshr_b64 s[46:47], s[10:11], 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 18
+; SI-NEXT:    v_writelane_b32 v22, s47, 19
+; SI-NEXT:    s_lshr_b64 s[46:47], s[12:13], 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 16
+; SI-NEXT:    v_writelane_b32 v22, s47, 17
+; SI-NEXT:    s_lshr_b64 s[46:47], s[12:13], 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 14
+; SI-NEXT:    v_writelane_b32 v22, s47, 15
+; SI-NEXT:    s_lshr_b64 s[46:47], s[12:13], 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 12
+; SI-NEXT:    v_writelane_b32 v22, s47, 13
+; SI-NEXT:    s_lshr_b64 s[46:47], s[14:15], 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 10
+; SI-NEXT:    v_writelane_b32 v22, s47, 11
+; SI-NEXT:    s_lshr_b64 s[46:47], s[14:15], 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 8
+; SI-NEXT:    v_writelane_b32 v22, s47, 9
+; SI-NEXT:    s_lshr_b64 s[46:47], s[14:15], 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 6
+; SI-NEXT:    v_writelane_b32 v22, s47, 7
+; SI-NEXT:    s_lshr_b64 s[46:47], s[40:41], 24
+; SI-NEXT:    v_writelane_b32 v22, s46, 4
+; SI-NEXT:    v_writelane_b32 v22, s47, 5
+; SI-NEXT:    s_lshr_b64 s[46:47], s[40:41], 16
+; SI-NEXT:    v_writelane_b32 v22, s46, 2
+; SI-NEXT:    v_writelane_b32 v22, s47, 3
+; SI-NEXT:    s_lshr_b64 s[46:47], s[40:41], 8
+; SI-NEXT:    v_writelane_b32 v22, s46, 0
+; SI-NEXT:    s_lshr_b32 s49, s19, 24
+; SI-NEXT:    s_lshr_b32 s48, s19, 16
+; SI-NEXT:    s_lshr_b32 s50, s19, 8
+; SI-NEXT:    s_lshr_b32 s51, s17, 24
+; SI-NEXT:    s_lshr_b32 s52, s17, 16
+; SI-NEXT:    s_lshr_b32 s53, s17, 8
+; SI-NEXT:    s_lshr_b64 s[54:55], s[4:5], 16
+; SI-NEXT:    v_writelane_b32 v22, s47, 1
+; SI-NEXT:    s_lshr_b64 s[64:65], s[42:43], 24
+; SI-NEXT:    s_lshr_b64 s[66:67], s[42:43], 16
+; SI-NEXT:    s_lshr_b64 s[68:69], s[42:43], 8
+; SI-NEXT:    s_lshr_b64 s[70:71], s[44:45], 24
+; SI-NEXT:    s_lshr_b64 s[80:81], s[44:45], 16
+; SI-NEXT:    s_lshr_b64 s[82:83], s[44:45], 8
+; SI-NEXT:    s_lshr_b64 s[84:85], s[28:29], 24
+; SI-NEXT:    s_lshr_b64 s[86:87], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[96:97], s[28:29], 8
+; SI-NEXT:    s_lshr_b64 s[98:99], s[26:27], 24
+; SI-NEXT:    s_lshr_b64 s[46:47], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[26:27], 8
+; SI-NEXT:    s_lshr_b64 s[58:59], s[24:25], 24
+; SI-NEXT:    s_lshr_b64 s[60:61], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[62:63], s[24:25], 8
+; SI-NEXT:    s_lshr_b64 s[72:73], s[22:23], 24
+; SI-NEXT:    s_lshr_b64 s[74:75], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[76:77], s[22:23], 8
+; SI-NEXT:    s_lshr_b64 s[78:79], s[20:21], 24
+; SI-NEXT:    s_lshr_b64 s[88:89], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[90:91], s[20:21], 8
+; SI-NEXT:    s_lshr_b64 s[92:93], s[18:19], 24
+; SI-NEXT:    s_lshr_b64 s[94:95], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[30:31], s[18:19], 8
+; SI-NEXT:    s_lshr_b64 s[34:35], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[36:37], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[38:39], s[16:17], 8
 ; SI-NEXT:  .LBB57_3: ; %end
-; SI-NEXT:    s_and_b32 s4, s16, 0xff
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v40, 8, v40
-; SI-NEXT:    v_or_b32_e32 v40, s4, v40
-; SI-NEXT:    s_and_b32 s4, s17, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s71, 8
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s70, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s69, 24
-; SI-NEXT:    v_and_b32_e32 v55, 0xff, v55
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
-; SI-NEXT:    v_lshlrev_b32_e32 v55, 16, v55
-; SI-NEXT:    v_lshlrev_b32_e32 v54, 24, v54
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_or_b32_e32 v54, v54, v55
-; SI-NEXT:    v_mov_b32_e32 v55, s4
-; SI-NEXT:    s_and_b32 s4, s18, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v53, 8, v53
-; SI-NEXT:    v_or_b32_e32 v53, s4, v53
-; SI-NEXT:    s_and_b32 s4, s19, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s68, 8
-; SI-NEXT:    v_and_b32_e32 v52, 0xff, v52
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s67, 0xff
-; SI-NEXT:    v_and_b32_e32 v40, 0xffff, v40
-; SI-NEXT:    v_lshlrev_b32_e32 v52, 16, v52
-; SI-NEXT:    v_lshlrev_b32_e32 v51, 24, v51
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s66, 24
-; SI-NEXT:    v_or_b32_e32 v54, v40, v54
-; SI-NEXT:    v_and_b32_e32 v53, 0xffff, v53
-; SI-NEXT:    v_or_b32_e32 v51, v51, v52
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
-; SI-NEXT:    buffer_store_dword v54, v0, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v54, vcc, 4, v0
-; SI-NEXT:    v_or_b32_e32 v51, v53, v51
-; SI-NEXT:    v_add_i32_e32 v52, vcc, 8, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v55, v54, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v51, v52, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v52, s4
-; SI-NEXT:    s_and_b32 s4, s20, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v50, 8, v50
-; SI-NEXT:    v_or_b32_e32 v50, s4, v50
-; SI-NEXT:    s_and_b32 s4, s21, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s65, 8
-; SI-NEXT:    v_and_b32_e32 v49, 0xff, v49
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s64, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v49, 16, v49
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 24, v48
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s55, 24
-; SI-NEXT:    v_and_b32_e32 v50, 0xffff, v50
-; SI-NEXT:    v_or_b32_e32 v48, v48, v49
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
+; SI-NEXT:    s_lshl_b32 s47, s38, 8
+; SI-NEXT:    s_and_b32 s16, s16, 0xff
+; SI-NEXT:    s_or_b32 s16, s16, s47
+; SI-NEXT:    s_and_b32 s47, s36, 0xff
+; SI-NEXT:    s_lshl_b32 s57, s34, 24
+; SI-NEXT:    s_lshl_b32 s47, s47, 16
+; SI-NEXT:    s_or_b32 s47, s57, s47
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s47
+; SI-NEXT:    v_mov_b32_e32 v1, s16
+; SI-NEXT:    s_and_b32 s16, s17, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s53, 8
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    s_and_b32 s17, s52, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_lshl_b32 s47, s51, 24
+; SI-NEXT:    s_or_b32 s17, s47, s17
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_lshl_b32 s16, s30, 8
+; SI-NEXT:    s_and_b32 s17, s18, 0xff
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_and_b32 s17, s94, 0xff
+; SI-NEXT:    s_lshl_b32 s18, s92, 24
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v3, s16
+; SI-NEXT:    s_and_b32 s16, s19, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s50, 8
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    s_and_b32 s17, s48, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_lshl_b32 s18, s49, 24
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v4, s16
+; SI-NEXT:    s_lshl_b32 s16, s90, 8
+; SI-NEXT:    s_and_b32 s17, s20, 0xff
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_and_b32 s17, s88, 0xff
+; SI-NEXT:    s_lshl_b32 s18, s78, 24
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_readlane_b32 s17, v21, 17
+; SI-NEXT:    v_mov_b32_e32 v5, s16
+; SI-NEXT:    s_and_b32 s16, s21, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 8
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_readlane_b32 s17, v21, 16
+; SI-NEXT:    s_and_b32 s17, s17, 0xff
+; SI-NEXT:    v_readlane_b32 s18, v21, 15
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_lshl_b32 s18, s18, 24
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v6, s16
+; SI-NEXT:    s_lshl_b32 s16, s76, 8
+; SI-NEXT:    s_and_b32 s17, s22, 0xff
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_and_b32 s17, s74, 0xff
+; SI-NEXT:    s_lshl_b32 s18, s72, 24
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_readlane_b32 s17, v21, 14
+; SI-NEXT:    v_mov_b32_e32 v7, s16
+; SI-NEXT:    s_and_b32 s16, s23, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 8
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_readlane_b32 s17, v21, 13
+; SI-NEXT:    s_and_b32 s17, s17, 0xff
+; SI-NEXT:    v_readlane_b32 s18, v21, 12
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_lshl_b32 s18, s18, 24
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v8, s16
+; SI-NEXT:    s_lshl_b32 s16, s62, 8
+; SI-NEXT:    s_and_b32 s17, s24, 0xff
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_and_b32 s17, s60, 0xff
+; SI-NEXT:    s_lshl_b32 s18, s58, 24
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_readlane_b32 s17, v21, 11
+; SI-NEXT:    v_mov_b32_e32 v9, s16
+; SI-NEXT:    s_and_b32 s16, s25, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 8
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_readlane_b32 s17, v21, 10
+; SI-NEXT:    s_and_b32 s17, s17, 0xff
+; SI-NEXT:    v_readlane_b32 s18, v21, 9
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_lshl_b32 s18, s18, 24
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v10, s16
+; SI-NEXT:    s_lshl_b32 s16, s56, 8
+; SI-NEXT:    s_and_b32 s17, s26, 0xff
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_and_b32 s17, s46, 0xff
+; SI-NEXT:    s_lshl_b32 s18, s98, 24
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_readlane_b32 s17, v21, 8
+; SI-NEXT:    v_mov_b32_e32 v11, s16
+; SI-NEXT:    s_and_b32 s16, s27, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 8
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_readlane_b32 s17, v21, 7
+; SI-NEXT:    s_and_b32 s17, s17, 0xff
+; SI-NEXT:    v_readlane_b32 s18, v21, 6
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_lshl_b32 s18, s18, 24
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v12, s16
+; SI-NEXT:    s_lshl_b32 s16, s96, 8
+; SI-NEXT:    s_and_b32 s17, s28, 0xff
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_and_b32 s17, s86, 0xff
+; SI-NEXT:    s_lshl_b32 s18, s84, 24
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_readlane_b32 s17, v21, 5
+; SI-NEXT:    v_mov_b32_e32 v13, s16
+; SI-NEXT:    s_and_b32 s16, s29, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 8
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_readlane_b32 s17, v21, 4
+; SI-NEXT:    s_and_b32 s17, s17, 0xff
+; SI-NEXT:    v_readlane_b32 s18, v21, 3
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_lshl_b32 s18, s18, 24
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v14, s16
+; SI-NEXT:    s_lshl_b32 s16, s82, 8
+; SI-NEXT:    s_and_b32 s17, s44, 0xff
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_and_b32 s17, s80, 0xff
+; SI-NEXT:    s_lshl_b32 s18, s70, 24
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_readlane_b32 s17, v21, 2
+; SI-NEXT:    v_mov_b32_e32 v15, s16
+; SI-NEXT:    s_and_b32 s16, s45, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 8
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_readlane_b32 s17, v21, 1
+; SI-NEXT:    s_and_b32 s17, s17, 0xff
+; SI-NEXT:    v_readlane_b32 s18, v21, 0
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v51, vcc, 12, v0
-; SI-NEXT:    v_or_b32_e32 v48, v50, v48
-; SI-NEXT:    v_add_i32_e32 v49, vcc, 16, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v52, v51, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v48, v49, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v49, s4
-; SI-NEXT:    s_and_b32 s4, s22, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v39, 8, v39
-; SI-NEXT:    v_or_b32_e32 v39, s4, v39
-; SI-NEXT:    s_and_b32 s4, s23, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s54, 8
-; SI-NEXT:    v_and_b32_e32 v38, 0xff, v38
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s53, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v38, 16, v38
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 24, v37
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s52, 24
-; SI-NEXT:    v_and_b32_e32 v39, 0xffff, v39
-; SI-NEXT:    v_or_b32_e32 v37, v37, v38
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_lshl_b32 s18, s18, 24
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 12, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 16, v0
+; SI-NEXT:    v_mov_b32_e32 v16, s16
+; SI-NEXT:    s_lshl_b32 s16, s68, 8
+; SI-NEXT:    s_and_b32 s17, s42, 0xff
+; SI-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 20, v0
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_and_b32 s17, s66, 0xff
+; SI-NEXT:    buffer_store_dword v6, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 24, v0
+; SI-NEXT:    s_lshl_b32 s18, s64, 24
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    buffer_store_dword v7, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 28, v0
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 32, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 36, v0
+; SI-NEXT:    v_readlane_b32 s17, v22, 63
+; SI-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 40, v0
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s43, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 8
+; SI-NEXT:    buffer_store_dword v11, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 44, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_readlane_b32 s17, v22, 62
+; SI-NEXT:    buffer_store_dword v12, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 48, v0
+; SI-NEXT:    s_and_b32 s17, s17, 0xff
+; SI-NEXT:    v_readlane_b32 s18, v22, 61
+; SI-NEXT:    buffer_store_dword v13, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 52, v0
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_lshl_b32 s18, s18, 24
+; SI-NEXT:    buffer_store_dword v14, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 56, v0
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    v_readlane_b32 s18, v22, 0
+; SI-NEXT:    buffer_store_dword v15, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 60, v0
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    v_readlane_b32 s19, v22, 1
+; SI-NEXT:    buffer_store_dword v16, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 64, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    s_lshl_b32 s17, s18, 8
+; SI-NEXT:    v_readlane_b32 s18, v22, 2
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v48, vcc, 20, v0
-; SI-NEXT:    v_or_b32_e32 v37, v39, v37
-; SI-NEXT:    v_add_i32_e32 v38, vcc, 24, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v49, v48, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v37, v38, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v38, s4
-; SI-NEXT:    s_and_b32 s4, s24, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v35, 8, v35
-; SI-NEXT:    v_or_b32_e32 v35, s4, v35
-; SI-NEXT:    s_and_b32 s4, s25, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s51, 8
-; SI-NEXT:    v_and_b32_e32 v34, 0xff, v34
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s50, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v34
-; SI-NEXT:    v_lshlrev_b32_e32 v33, 24, v33
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s49, 24
-; SI-NEXT:    v_and_b32_e32 v35, 0xffff, v35
-; SI-NEXT:    v_or_b32_e32 v33, v33, v34
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s40, 0xff
+; SI-NEXT:    v_readlane_b32 s19, v22, 3
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    s_and_b32 s17, s18, 0xff
+; SI-NEXT:    v_readlane_b32 s18, v22, 4
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_lshl_b32 s18, s18, 24
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x44, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_readlane_b32 s17, v22, 60
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v37, vcc, 28, v0
-; SI-NEXT:    v_or_b32_e32 v33, v35, v33
-; SI-NEXT:    v_add_i32_e32 v34, vcc, 32, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v38, v37, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v33, v34, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v34, s4
-; SI-NEXT:    s_and_b32 s4, s26, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v29, 8, v29
-; SI-NEXT:    v_or_b32_e32 v29, s4, v29
-; SI-NEXT:    s_and_b32 s4, s27, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s48, 8
-; SI-NEXT:    v_and_b32_e32 v28, 0xff, v28
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s39, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
-; SI-NEXT:    v_lshlrev_b32_e32 v27, 24, v27
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s38, 24
-; SI-NEXT:    v_and_b32_e32 v29, 0xffff, v29
-; SI-NEXT:    v_or_b32_e32 v27, v27, v28
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s41, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 8
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_readlane_b32 s17, v22, 59
+; SI-NEXT:    s_and_b32 s17, s17, 0xff
+; SI-NEXT:    v_readlane_b32 s18, v22, 58
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_lshl_b32 s18, s18, 24
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x48, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v33, vcc, 36, v0
-; SI-NEXT:    v_or_b32_e32 v27, v29, v27
-; SI-NEXT:    v_add_i32_e32 v28, vcc, 40, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v34, v33, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v27, v28, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v28, s4
-; SI-NEXT:    s_and_b32 s4, s28, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v23, 8, v23
-; SI-NEXT:    v_or_b32_e32 v23, s4, v23
-; SI-NEXT:    s_and_b32 s4, s29, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s37, 8
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    v_readlane_b32 s16, v22, 6
+; SI-NEXT:    s_and_b32 s14, s14, 0xff
+; SI-NEXT:    v_readlane_b32 s17, v22, 7
+; SI-NEXT:    s_lshl_b32 s16, s16, 8
+; SI-NEXT:    v_readlane_b32 s19, v22, 5
+; SI-NEXT:    s_or_b32 s14, s14, s16
+; SI-NEXT:    v_readlane_b32 s16, v22, 8
+; SI-NEXT:    v_readlane_b32 s17, v22, 9
+; SI-NEXT:    s_and_b32 s16, s16, 0xff
+; SI-NEXT:    v_readlane_b32 s18, v22, 10
+; SI-NEXT:    s_lshl_b32 s16, s16, 16
+; SI-NEXT:    s_lshl_b32 s17, s18, 24
+; SI-NEXT:    s_and_b32 s14, s14, 0xffff
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x4c, v0
+; SI-NEXT:    s_or_b32 s14, s14, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v27, vcc, 44, v0
-; SI-NEXT:    v_and_b32_e32 v22, 0xff, v22
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s36, 0xff
-; SI-NEXT:    buffer_store_dword v28, v27, s[0:3], 0 offen
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; SI-NEXT:    v_lshlrev_b32_e32 v27, 24, v36
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s35, 24
-; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v23
-; SI-NEXT:    v_or_b32_e32 v22, v27, v22
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
-; SI-NEXT:    v_or_b32_e32 v22, v23, v22
-; SI-NEXT:    v_add_i32_e32 v23, vcc, 48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v22, v23, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v2, s14
+; SI-NEXT:    s_and_b32 s14, s15, 0xff
+; SI-NEXT:    v_readlane_b32 s15, v22, 57
+; SI-NEXT:    s_lshl_b32 s15, s15, 8
+; SI-NEXT:    s_or_b32 s14, s14, s15
+; SI-NEXT:    v_readlane_b32 s15, v22, 56
+; SI-NEXT:    s_and_b32 s15, s15, 0xff
+; SI-NEXT:    v_readlane_b32 s16, v22, 55
+; SI-NEXT:    s_lshl_b32 s15, s15, 16
+; SI-NEXT:    s_lshl_b32 s16, s16, 24
+; SI-NEXT:    s_and_b32 s14, s14, 0xffff
+; SI-NEXT:    s_or_b32 s15, s16, s15
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x50, v0
+; SI-NEXT:    s_or_b32 s14, s14, s15
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v22, vcc, 52, v0
-; SI-NEXT:    v_mov_b32_e32 v23, s4
-; SI-NEXT:    buffer_store_dword v23, v22, s[0:3], 0 offen
-; SI-NEXT:    s_and_b32 s4, s47, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 8, v32
-; SI-NEXT:    v_or_b32_e32 v22, s4, v22
-; SI-NEXT:    s_and_b32 s4, s46, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s34, 8
+; SI-NEXT:    v_mov_b32_e32 v2, s14
+; SI-NEXT:    v_readlane_b32 s14, v22, 12
+; SI-NEXT:    s_and_b32 s12, s12, 0xff
+; SI-NEXT:    v_readlane_b32 s15, v22, 13
+; SI-NEXT:    s_lshl_b32 s14, s14, 8
+; SI-NEXT:    s_or_b32 s12, s12, s14
+; SI-NEXT:    v_readlane_b32 s14, v22, 14
+; SI-NEXT:    v_readlane_b32 s15, v22, 15
+; SI-NEXT:    s_and_b32 s14, s14, 0xff
+; SI-NEXT:    v_readlane_b32 s16, v22, 16
+; SI-NEXT:    s_lshl_b32 s14, s14, 16
+; SI-NEXT:    s_lshl_b32 s15, s16, 24
+; SI-NEXT:    s_and_b32 s12, s12, 0xffff
+; SI-NEXT:    s_or_b32 s14, s15, s14
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x54, v0
+; SI-NEXT:    s_or_b32 s12, s12, s14
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v23, 0xff, v31
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s31, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; SI-NEXT:    v_lshlrev_b32_e32 v27, 24, v30
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s30, 24
-; SI-NEXT:    v_and_b32_e32 v22, 0xffff, v22
-; SI-NEXT:    v_or_b32_e32 v23, v27, v23
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
-; SI-NEXT:    v_or_b32_e32 v22, v22, v23
-; SI-NEXT:    v_add_i32_e32 v23, vcc, 56, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v22, v23, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v2, s12
+; SI-NEXT:    s_and_b32 s12, s13, 0xff
+; SI-NEXT:    v_readlane_b32 s13, v22, 54
+; SI-NEXT:    s_lshl_b32 s13, s13, 8
+; SI-NEXT:    s_or_b32 s12, s12, s13
+; SI-NEXT:    v_readlane_b32 s13, v22, 53
+; SI-NEXT:    s_and_b32 s13, s13, 0xff
+; SI-NEXT:    v_readlane_b32 s14, v22, 52
+; SI-NEXT:    s_lshl_b32 s13, s13, 16
+; SI-NEXT:    s_lshl_b32 s14, s14, 24
+; SI-NEXT:    s_and_b32 s12, s12, 0xffff
+; SI-NEXT:    s_or_b32 s13, s14, s13
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x58, v0
+; SI-NEXT:    s_or_b32 s12, s12, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v22, vcc, 60, v0
-; SI-NEXT:    v_mov_b32_e32 v23, s4
-; SI-NEXT:    buffer_store_dword v23, v22, s[0:3], 0 offen
-; SI-NEXT:    s_and_b32 s4, s45, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 8, v26
-; SI-NEXT:    v_or_b32_e32 v22, s4, v22
-; SI-NEXT:    s_and_b32 s4, s44, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s95, 8
+; SI-NEXT:    v_mov_b32_e32 v2, s12
+; SI-NEXT:    v_readlane_b32 s12, v22, 18
+; SI-NEXT:    s_and_b32 s10, s10, 0xff
+; SI-NEXT:    v_readlane_b32 s13, v22, 19
+; SI-NEXT:    s_lshl_b32 s12, s12, 8
+; SI-NEXT:    s_or_b32 s10, s10, s12
+; SI-NEXT:    v_readlane_b32 s12, v22, 20
+; SI-NEXT:    v_readlane_b32 s13, v22, 21
+; SI-NEXT:    s_and_b32 s12, s12, 0xff
+; SI-NEXT:    v_readlane_b32 s14, v22, 22
+; SI-NEXT:    s_lshl_b32 s12, s12, 16
+; SI-NEXT:    s_lshl_b32 s13, s14, 24
+; SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; SI-NEXT:    s_or_b32 s12, s13, s12
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x5c, v0
+; SI-NEXT:    s_or_b32 s10, s10, s12
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v23, 0xff, v25
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s94, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; SI-NEXT:    v_lshlrev_b32_e32 v24, 24, v24
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s93, 24
-; SI-NEXT:    v_and_b32_e32 v22, 0xffff, v22
-; SI-NEXT:    v_or_b32_e32 v23, v24, v23
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
-; SI-NEXT:    v_or_b32_e32 v22, v22, v23
-; SI-NEXT:    v_add_i32_e32 v23, vcc, 64, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v22, v23, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v23, s4
-; SI-NEXT:    s_and_b32 s4, s43, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 8, v21
-; SI-NEXT:    v_or_b32_e32 v21, s4, v21
-; SI-NEXT:    s_and_b32 s4, s42, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s92, 8
-; SI-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s91, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 24, v19
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s90, 24
-; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v21
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s10, s11, 0xff
+; SI-NEXT:    v_readlane_b32 s11, v22, 51
+; SI-NEXT:    s_lshl_b32 s11, s11, 8
+; SI-NEXT:    s_or_b32 s10, s10, s11
+; SI-NEXT:    v_readlane_b32 s11, v22, 50
+; SI-NEXT:    s_and_b32 s11, s11, 0xff
+; SI-NEXT:    v_readlane_b32 s12, v22, 49
+; SI-NEXT:    s_lshl_b32 s11, s11, 16
+; SI-NEXT:    s_lshl_b32 s12, s12, 24
+; SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; SI-NEXT:    s_or_b32 s11, s12, s11
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x60, v0
+; SI-NEXT:    s_or_b32 s10, s10, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v22, vcc, 0x44, v0
-; SI-NEXT:    v_or_b32_e32 v19, v21, v19
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 0x48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v23, v22, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v20, s4
-; SI-NEXT:    s_and_b32 s4, s41, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 8, v18
-; SI-NEXT:    v_or_b32_e32 v18, s4, v18
-; SI-NEXT:    s_and_b32 s4, s40, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s89, 8
-; SI-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s88, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 24, v16
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s79, 24
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v18
-; SI-NEXT:    v_or_b32_e32 v16, v16, v17
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    v_readlane_b32 s10, v22, 24
+; SI-NEXT:    s_and_b32 s8, s8, 0xff
+; SI-NEXT:    v_readlane_b32 s11, v22, 25
+; SI-NEXT:    s_lshl_b32 s10, s10, 8
+; SI-NEXT:    s_or_b32 s8, s8, s10
+; SI-NEXT:    v_readlane_b32 s10, v22, 26
+; SI-NEXT:    v_readlane_b32 s11, v22, 27
+; SI-NEXT:    s_and_b32 s10, s10, 0xff
+; SI-NEXT:    v_readlane_b32 s12, v22, 28
+; SI-NEXT:    s_lshl_b32 s10, s10, 16
+; SI-NEXT:    s_lshl_b32 s11, s12, 24
+; SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; SI-NEXT:    s_or_b32 s10, s11, s10
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x64, v0
+; SI-NEXT:    s_or_b32 s8, s8, s10
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v19, vcc, 0x4c, v0
-; SI-NEXT:    v_or_b32_e32 v16, v18, v16
-; SI-NEXT:    v_add_i32_e32 v17, vcc, 0x50, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v20, v19, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v16, v17, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v17, s4
-; SI-NEXT:    s_and_b32 s4, s15, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 8, v15
-; SI-NEXT:    v_or_b32_e32 v15, s4, v15
-; SI-NEXT:    s_and_b32 s4, s14, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s78, 8
-; SI-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s77, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 24, v13
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s14, s76, 24
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; SI-NEXT:    v_or_b32_e32 v13, v13, v14
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s14, s5
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s8, s9, 0xff
+; SI-NEXT:    v_readlane_b32 s9, v22, 48
+; SI-NEXT:    s_lshl_b32 s9, s9, 8
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    v_readlane_b32 s9, v22, 47
+; SI-NEXT:    s_and_b32 s9, s9, 0xff
+; SI-NEXT:    v_readlane_b32 s10, v22, 46
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_lshl_b32 s10, s10, 24
+; SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; SI-NEXT:    s_or_b32 s9, s10, s9
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x68, v0
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v16, vcc, 0x54, v0
-; SI-NEXT:    v_or_b32_e32 v13, v15, v13
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 0x58, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v17, v16, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v14, s4
-; SI-NEXT:    s_and_b32 s4, s13, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 8, v12
-; SI-NEXT:    v_or_b32_e32 v12, s4, v12
-; SI-NEXT:    s_and_b32 s4, s12, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s75, 8
-; SI-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s74, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s12, s73, 24
-; SI-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; SI-NEXT:    v_or_b32_e32 v10, v10, v11
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s12, s5
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_readlane_b32 s8, v22, 30
+; SI-NEXT:    s_and_b32 s6, s6, 0xff
+; SI-NEXT:    v_readlane_b32 s9, v22, 31
+; SI-NEXT:    s_lshl_b32 s8, s8, 8
+; SI-NEXT:    s_or_b32 s6, s6, s8
+; SI-NEXT:    v_readlane_b32 s8, v22, 32
+; SI-NEXT:    v_readlane_b32 s9, v22, 33
+; SI-NEXT:    s_and_b32 s8, s8, 0xff
+; SI-NEXT:    v_readlane_b32 s10, v22, 34
+; SI-NEXT:    s_lshl_b32 s8, s8, 16
+; SI-NEXT:    s_lshl_b32 s9, s10, 24
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_or_b32 s8, s9, s8
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x6c, v0
+; SI-NEXT:    s_or_b32 s6, s6, s8
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v13, vcc, 0x5c, v0
-; SI-NEXT:    v_or_b32_e32 v10, v12, v10
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 0x60, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v14, v13, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v10, v11, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v11, s4
-; SI-NEXT:    s_and_b32 s4, s11, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; SI-NEXT:    v_or_b32_e32 v9, s4, v9
-; SI-NEXT:    s_and_b32 s4, s10, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s72, 8
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s63, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s10, s62, 24
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; SI-NEXT:    v_or_b32_e32 v7, v7, v8
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s10, s5
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s6, s7, 0xff
+; SI-NEXT:    v_readlane_b32 s7, v22, 45
+; SI-NEXT:    s_lshl_b32 s7, s7, 8
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    v_readlane_b32 s7, v22, 44
+; SI-NEXT:    s_and_b32 s7, s7, 0xff
+; SI-NEXT:    v_readlane_b32 s8, v22, 43
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    s_lshl_b32 s8, s8, 24
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_or_b32 s7, s8, s7
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x70, v0
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 0x64, v0
-; SI-NEXT:    v_or_b32_e32 v7, v9, v7
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x68, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v11, v10, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v8, s4
-; SI-NEXT:    s_and_b32 s4, s9, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
-; SI-NEXT:    v_or_b32_e32 v6, s4, v6
-; SI-NEXT:    s_and_b32 s4, s8, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s61, 8
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s60, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s8, s59, 24
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; SI-NEXT:    v_or_b32_e32 v4, v4, v5
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    v_readlane_b32 s6, v22, 36
+; SI-NEXT:    s_and_b32 s4, s4, 0xff
+; SI-NEXT:    s_lshl_b32 s6, s6, 8
+; SI-NEXT:    v_readlane_b32 s7, v22, 37
+; SI-NEXT:    s_or_b32 s4, s4, s6
+; SI-NEXT:    s_and_b32 s6, s54, 0xff
+; SI-NEXT:    v_readlane_b32 s8, v22, 38
+; SI-NEXT:    s_lshl_b32 s6, s6, 16
+; SI-NEXT:    s_lshl_b32 s7, s8, 24
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s8, s5
+; SI-NEXT:    s_or_b32 s6, s7, s6
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x74, v0
+; SI-NEXT:    s_or_b32 s4, s4, s6
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x6c, v0
-; SI-NEXT:    v_or_b32_e32 v4, v6, v4
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x70, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v8, v7, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v5, s4
-; SI-NEXT:    s_and_b32 s4, s7, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; SI-NEXT:    v_or_b32_e32 v3, s4, v3
-; SI-NEXT:    s_and_b32 s4, s6, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s58, 8
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT:    v_mov_b32_e32 v2, s4
+; SI-NEXT:    s_and_b32 s4, s5, 0xff
+; SI-NEXT:    v_readlane_b32 s5, v22, 42
+; SI-NEXT:    s_lshl_b32 s5, s5, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s57, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; SI-NEXT:    v_readlane_b32 s5, v22, 41
+; SI-NEXT:    s_and_b32 s5, s5, 0xff
+; SI-NEXT:    v_readlane_b32 s6, v22, 40
 ; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s6, s56, 24
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    s_lshl_b32 s6, s6, 24
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s5, s6, s5
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x74, v0
-; SI-NEXT:    v_or_b32_e32 v1, v3, v1
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x78, v0
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x78, v0
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
+; SI-NEXT:    v_readlane_b32 s19, v22, 11
+; SI-NEXT:    v_readlane_b32 s17, v22, 17
+; SI-NEXT:    v_readlane_b32 s15, v22, 23
+; SI-NEXT:    v_readlane_b32 s13, v22, 29
+; SI-NEXT:    v_readlane_b32 s11, v22, 35
+; SI-NEXT:    v_readlane_b32 s9, v22, 39
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT:    v_readlane_b32 s71, v41, 23
-; SI-NEXT:    v_readlane_b32 s70, v41, 22
-; SI-NEXT:    v_readlane_b32 s69, v41, 21
-; SI-NEXT:    v_readlane_b32 s68, v41, 20
-; SI-NEXT:    v_readlane_b32 s67, v41, 19
-; SI-NEXT:    v_readlane_b32 s66, v41, 18
-; SI-NEXT:    v_readlane_b32 s65, v41, 17
-; SI-NEXT:    v_readlane_b32 s64, v41, 16
-; SI-NEXT:    v_readlane_b32 s55, v41, 15
-; SI-NEXT:    v_readlane_b32 s54, v41, 14
-; SI-NEXT:    v_readlane_b32 s53, v41, 13
-; SI-NEXT:    v_readlane_b32 s52, v41, 12
-; SI-NEXT:    v_readlane_b32 s51, v41, 11
-; SI-NEXT:    v_readlane_b32 s50, v41, 10
-; SI-NEXT:    v_readlane_b32 s49, v41, 9
-; SI-NEXT:    v_readlane_b32 s48, v41, 8
-; SI-NEXT:    v_readlane_b32 s39, v41, 7
-; SI-NEXT:    v_readlane_b32 s38, v41, 6
-; SI-NEXT:    v_readlane_b32 s37, v41, 5
-; SI-NEXT:    v_readlane_b32 s36, v41, 4
-; SI-NEXT:    v_readlane_b32 s35, v41, 3
-; SI-NEXT:    v_readlane_b32 s34, v41, 2
-; SI-NEXT:    v_readlane_b32 s31, v41, 1
-; SI-NEXT:    v_readlane_b32 s30, v41, 0
-; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT:    v_readlane_b32 s99, v20, 35
+; SI-NEXT:    v_readlane_b32 s98, v20, 34
+; SI-NEXT:    v_readlane_b32 s97, v20, 33
+; SI-NEXT:    v_readlane_b32 s96, v20, 32
+; SI-NEXT:    v_readlane_b32 s87, v20, 31
+; SI-NEXT:    v_readlane_b32 s86, v20, 30
+; SI-NEXT:    v_readlane_b32 s85, v20, 29
+; SI-NEXT:    v_readlane_b32 s84, v20, 28
+; SI-NEXT:    v_readlane_b32 s83, v20, 27
+; SI-NEXT:    v_readlane_b32 s82, v20, 26
+; SI-NEXT:    v_readlane_b32 s81, v20, 25
+; SI-NEXT:    v_readlane_b32 s80, v20, 24
+; SI-NEXT:    v_readlane_b32 s71, v20, 23
+; SI-NEXT:    v_readlane_b32 s70, v20, 22
+; SI-NEXT:    v_readlane_b32 s69, v20, 21
+; SI-NEXT:    v_readlane_b32 s68, v20, 20
+; SI-NEXT:    v_readlane_b32 s67, v20, 19
+; SI-NEXT:    v_readlane_b32 s66, v20, 18
+; SI-NEXT:    v_readlane_b32 s65, v20, 17
+; SI-NEXT:    v_readlane_b32 s64, v20, 16
+; SI-NEXT:    v_readlane_b32 s55, v20, 15
+; SI-NEXT:    v_readlane_b32 s54, v20, 14
+; SI-NEXT:    v_readlane_b32 s53, v20, 13
+; SI-NEXT:    v_readlane_b32 s52, v20, 12
+; SI-NEXT:    v_readlane_b32 s51, v20, 11
+; SI-NEXT:    v_readlane_b32 s50, v20, 10
+; SI-NEXT:    v_readlane_b32 s49, v20, 9
+; SI-NEXT:    v_readlane_b32 s48, v20, 8
+; SI-NEXT:    v_readlane_b32 s39, v20, 7
+; SI-NEXT:    v_readlane_b32 s38, v20, 6
+; SI-NEXT:    v_readlane_b32 s37, v20, 5
+; SI-NEXT:    v_readlane_b32 s36, v20, 4
+; SI-NEXT:    v_readlane_b32 s35, v20, 3
+; SI-NEXT:    v_readlane_b32 s34, v20, 2
+; SI-NEXT:    v_readlane_b32 s31, v20, 1
+; SI-NEXT:    v_readlane_b32 s30, v20, 0
+; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB57_4:
-; SI-NEXT:    ; implicit-def: $vgpr40
-; SI-NEXT:    ; implicit-def: $vgpr55
-; SI-NEXT:    ; implicit-def: $vgpr54
-; SI-NEXT:    ; implicit-def: $sgpr71
-; SI-NEXT:    ; implicit-def: $sgpr70
-; SI-NEXT:    ; implicit-def: $sgpr69
-; SI-NEXT:    ; implicit-def: $vgpr53
-; SI-NEXT:    ; implicit-def: $vgpr52
-; SI-NEXT:    ; implicit-def: $vgpr51
-; SI-NEXT:    ; implicit-def: $sgpr68
-; SI-NEXT:    ; implicit-def: $sgpr67
-; SI-NEXT:    ; implicit-def: $sgpr66
-; SI-NEXT:    ; implicit-def: $vgpr50
-; SI-NEXT:    ; implicit-def: $vgpr49
-; SI-NEXT:    ; implicit-def: $vgpr48
-; SI-NEXT:    ; implicit-def: $sgpr65
-; SI-NEXT:    ; implicit-def: $sgpr64
-; SI-NEXT:    ; implicit-def: $sgpr55
-; SI-NEXT:    ; implicit-def: $vgpr39
-; SI-NEXT:    ; implicit-def: $vgpr38
-; SI-NEXT:    ; implicit-def: $vgpr37
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr54
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_writelane_b32 v22, s54, 0
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s55, 1
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
 ; SI-NEXT:    ; implicit-def: $sgpr54
 ; SI-NEXT:    ; implicit-def: $sgpr53
 ; SI-NEXT:    ; implicit-def: $sgpr52
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr34
-; SI-NEXT:    ; implicit-def: $vgpr33
 ; SI-NEXT:    ; implicit-def: $sgpr51
 ; SI-NEXT:    ; implicit-def: $sgpr50
-; SI-NEXT:    ; implicit-def: $sgpr49
-; SI-NEXT:    ; implicit-def: $vgpr29
-; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr27
 ; SI-NEXT:    ; implicit-def: $sgpr48
-; SI-NEXT:    ; implicit-def: $sgpr39
+; SI-NEXT:    ; implicit-def: $sgpr49
 ; SI-NEXT:    ; implicit-def: $sgpr38
-; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; implicit-def: $sgpr37
 ; SI-NEXT:    ; implicit-def: $sgpr36
-; SI-NEXT:    ; implicit-def: $sgpr35
 ; SI-NEXT:    ; implicit-def: $sgpr34
-; SI-NEXT:    ; implicit-def: $sgpr31
 ; SI-NEXT:    ; implicit-def: $sgpr30
-; SI-NEXT:    ; implicit-def: $sgpr95
 ; SI-NEXT:    ; implicit-def: $sgpr94
-; SI-NEXT:    ; implicit-def: $sgpr93
 ; SI-NEXT:    ; implicit-def: $sgpr92
-; SI-NEXT:    ; implicit-def: $sgpr91
 ; SI-NEXT:    ; implicit-def: $sgpr90
-; SI-NEXT:    ; implicit-def: $sgpr89
 ; SI-NEXT:    ; implicit-def: $sgpr88
-; SI-NEXT:    ; implicit-def: $sgpr79
 ; SI-NEXT:    ; implicit-def: $sgpr78
-; SI-NEXT:    ; implicit-def: $sgpr77
 ; SI-NEXT:    ; implicit-def: $sgpr76
-; SI-NEXT:    ; implicit-def: $sgpr75
 ; SI-NEXT:    ; implicit-def: $sgpr74
-; SI-NEXT:    ; implicit-def: $sgpr73
 ; SI-NEXT:    ; implicit-def: $sgpr72
-; SI-NEXT:    ; implicit-def: $sgpr63
 ; SI-NEXT:    ; implicit-def: $sgpr62
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; implicit-def: $sgpr61
-; SI-NEXT:    ; implicit-def: $vgpr32
 ; SI-NEXT:    ; implicit-def: $sgpr60
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; implicit-def: $sgpr59
-; SI-NEXT:    ; implicit-def: $vgpr30
 ; SI-NEXT:    ; implicit-def: $sgpr58
-; SI-NEXT:    ; implicit-def: $vgpr26
-; SI-NEXT:    ; implicit-def: $sgpr57
-; SI-NEXT:    ; implicit-def: $vgpr25
 ; SI-NEXT:    ; implicit-def: $sgpr56
-; SI-NEXT:    ; implicit-def: $vgpr24
-; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr20
-; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    ; implicit-def: $vgpr18
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr16
-; SI-NEXT:    ; implicit-def: $vgpr15
-; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr7
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    s_branch .LBB57_2
-;
-; VI-LABEL: bitcast_v16i64_to_v128i8_scalar:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; VI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; VI-NEXT:    s_mov_b64 exec, s[4:5]
-; VI-NEXT:    v_writelane_b32 v20, s30, 0
-; VI-NEXT:    v_writelane_b32 v20, s31, 1
-; VI-NEXT:    v_writelane_b32 v20, s34, 2
-; VI-NEXT:    v_writelane_b32 v20, s35, 3
-; VI-NEXT:    v_writelane_b32 v20, s36, 4
-; VI-NEXT:    v_writelane_b32 v20, s37, 5
-; VI-NEXT:    v_writelane_b32 v20, s38, 6
-; VI-NEXT:    v_writelane_b32 v20, s39, 7
-; VI-NEXT:    v_writelane_b32 v20, s48, 8
-; VI-NEXT:    v_writelane_b32 v20, s49, 9
-; VI-NEXT:    v_writelane_b32 v20, s50, 10
-; VI-NEXT:    v_writelane_b32 v20, s51, 11
-; VI-NEXT:    v_writelane_b32 v20, s52, 12
-; VI-NEXT:    v_writelane_b32 v20, s53, 13
-; VI-NEXT:    v_writelane_b32 v20, s54, 14
-; VI-NEXT:    v_writelane_b32 v20, s55, 15
-; VI-NEXT:    v_writelane_b32 v20, s64, 16
-; VI-NEXT:    v_writelane_b32 v20, s65, 17
-; VI-NEXT:    v_writelane_b32 v20, s66, 18
-; VI-NEXT:    v_writelane_b32 v20, s67, 19
-; VI-NEXT:    v_writelane_b32 v20, s68, 20
-; VI-NEXT:    v_writelane_b32 v20, s69, 21
-; VI-NEXT:    v_writelane_b32 v20, s70, 22
-; VI-NEXT:    v_writelane_b32 v20, s71, 23
-; VI-NEXT:    v_writelane_b32 v20, s80, 24
-; VI-NEXT:    v_writelane_b32 v20, s81, 25
-; VI-NEXT:    v_writelane_b32 v20, s82, 26
-; VI-NEXT:    v_writelane_b32 v20, s83, 27
-; VI-NEXT:    v_writelane_b32 v20, s84, 28
-; VI-NEXT:    v_writelane_b32 v20, s85, 29
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v19
-; VI-NEXT:    v_writelane_b32 v20, s86, 30
-; VI-NEXT:    v_readfirstlane_b32 s44, v1
-; VI-NEXT:    v_readfirstlane_b32 s45, v2
-; VI-NEXT:    v_readfirstlane_b32 s42, v3
-; VI-NEXT:    v_readfirstlane_b32 s43, v4
-; VI-NEXT:    v_readfirstlane_b32 s40, v5
-; VI-NEXT:    v_readfirstlane_b32 s41, v6
-; VI-NEXT:    v_readfirstlane_b32 s14, v7
-; VI-NEXT:    v_readfirstlane_b32 s15, v8
-; VI-NEXT:    v_readfirstlane_b32 s12, v9
-; VI-NEXT:    v_readfirstlane_b32 s13, v10
-; VI-NEXT:    v_readfirstlane_b32 s10, v11
-; VI-NEXT:    v_readfirstlane_b32 s11, v12
+; SI-NEXT:    ; implicit-def: $sgpr98
+; SI-NEXT:    ; implicit-def: $sgpr96
+; SI-NEXT:    ; implicit-def: $sgpr86
+; SI-NEXT:    ; implicit-def: $sgpr84
+; SI-NEXT:    ; implicit-def: $sgpr82
+; SI-NEXT:    ; implicit-def: $sgpr80
+; SI-NEXT:    ; implicit-def: $sgpr70
+; SI-NEXT:    ; implicit-def: $sgpr68
+; SI-NEXT:    ; implicit-def: $sgpr66
+; SI-NEXT:    ; implicit-def: $sgpr64
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s54, 2
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s55, 3
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr54
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s54, 4
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s55, 5
+; SI-NEXT:    ; implicit-def: $sgpr54
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s54, 6
+; SI-NEXT:    v_writelane_b32 v22, s55, 7
+; SI-NEXT:    ; implicit-def: $sgpr54
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s54, 8
+; SI-NEXT:    v_writelane_b32 v22, s55, 9
+; SI-NEXT:    ; implicit-def: $sgpr54
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s54, 10
+; SI-NEXT:    v_writelane_b32 v22, s55, 11
+; SI-NEXT:    ; implicit-def: $sgpr54
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s54, 12
+; SI-NEXT:    v_writelane_b32 v22, s55, 13
+; SI-NEXT:    ; implicit-def: $sgpr54
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s54, 14
+; SI-NEXT:    v_writelane_b32 v22, s55, 15
+; SI-NEXT:    ; implicit-def: $sgpr54
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s54, 16
+; SI-NEXT:    v_writelane_b32 v22, s55, 17
+; SI-NEXT:    ; implicit-def: $sgpr54
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s54, 18
+; SI-NEXT:    v_writelane_b32 v22, s55, 19
+; SI-NEXT:    ; implicit-def: $sgpr54
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s54, 20
+; SI-NEXT:    v_writelane_b32 v22, s55, 21
+; SI-NEXT:    ; implicit-def: $sgpr54
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s54, 22
+; SI-NEXT:    v_writelane_b32 v22, s55, 23
+; SI-NEXT:    ; implicit-def: $sgpr54
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s54, 24
+; SI-NEXT:    v_writelane_b32 v22, s55, 25
+; SI-NEXT:    ; implicit-def: $sgpr54
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s54, 26
+; SI-NEXT:    v_writelane_b32 v22, s55, 27
+; SI-NEXT:    ; implicit-def: $sgpr54
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s54, 28
+; SI-NEXT:    v_writelane_b32 v22, s55, 29
+; SI-NEXT:    ; implicit-def: $sgpr54
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s54, 30
+; SI-NEXT:    v_writelane_b32 v22, s55, 31
+; SI-NEXT:    ; implicit-def: $sgpr54
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s54, 32
+; SI-NEXT:    v_writelane_b32 v22, s55, 33
+; SI-NEXT:    ; implicit-def: $sgpr54
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s54, 34
+; SI-NEXT:    v_writelane_b32 v22, s55, 35
+; SI-NEXT:    ; implicit-def: $sgpr54
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s54, 36
+; SI-NEXT:    v_writelane_b32 v22, s55, 37
+; SI-NEXT:    ; implicit-def: $sgpr54
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v22, s54, 38
+; SI-NEXT:    v_writelane_b32 v22, s55, 39
+; SI-NEXT:    ; implicit-def: $sgpr54
+; SI-NEXT:    s_branch .LBB57_2
+;
+; VI-LABEL: bitcast_v16i64_to_v128i8_scalar:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; VI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT:    s_mov_b64 exec, s[4:5]
+; VI-NEXT:    v_writelane_b32 v20, s30, 0
+; VI-NEXT:    v_writelane_b32 v20, s31, 1
+; VI-NEXT:    v_writelane_b32 v20, s34, 2
+; VI-NEXT:    v_writelane_b32 v20, s35, 3
+; VI-NEXT:    v_writelane_b32 v20, s36, 4
+; VI-NEXT:    v_writelane_b32 v20, s37, 5
+; VI-NEXT:    v_writelane_b32 v20, s38, 6
+; VI-NEXT:    v_writelane_b32 v20, s39, 7
+; VI-NEXT:    v_writelane_b32 v20, s48, 8
+; VI-NEXT:    v_writelane_b32 v20, s49, 9
+; VI-NEXT:    v_writelane_b32 v20, s50, 10
+; VI-NEXT:    v_writelane_b32 v20, s51, 11
+; VI-NEXT:    v_writelane_b32 v20, s52, 12
+; VI-NEXT:    v_writelane_b32 v20, s53, 13
+; VI-NEXT:    v_writelane_b32 v20, s54, 14
+; VI-NEXT:    v_writelane_b32 v20, s55, 15
+; VI-NEXT:    v_writelane_b32 v20, s64, 16
+; VI-NEXT:    v_writelane_b32 v20, s65, 17
+; VI-NEXT:    v_writelane_b32 v20, s66, 18
+; VI-NEXT:    v_writelane_b32 v20, s67, 19
+; VI-NEXT:    v_writelane_b32 v20, s68, 20
+; VI-NEXT:    v_writelane_b32 v20, s69, 21
+; VI-NEXT:    v_writelane_b32 v20, s70, 22
+; VI-NEXT:    v_writelane_b32 v20, s71, 23
+; VI-NEXT:    v_writelane_b32 v20, s80, 24
+; VI-NEXT:    v_writelane_b32 v20, s81, 25
+; VI-NEXT:    v_writelane_b32 v20, s82, 26
+; VI-NEXT:    v_writelane_b32 v20, s83, 27
+; VI-NEXT:    v_writelane_b32 v20, s84, 28
+; VI-NEXT:    v_writelane_b32 v20, s85, 29
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v19
+; VI-NEXT:    v_writelane_b32 v20, s86, 30
+; VI-NEXT:    v_readfirstlane_b32 s44, v1
+; VI-NEXT:    v_readfirstlane_b32 s45, v2
+; VI-NEXT:    v_readfirstlane_b32 s42, v3
+; VI-NEXT:    v_readfirstlane_b32 s43, v4
+; VI-NEXT:    v_readfirstlane_b32 s40, v5
+; VI-NEXT:    v_readfirstlane_b32 s41, v6
+; VI-NEXT:    v_readfirstlane_b32 s14, v7
+; VI-NEXT:    v_readfirstlane_b32 s15, v8
+; VI-NEXT:    v_readfirstlane_b32 s12, v9
+; VI-NEXT:    v_readfirstlane_b32 s13, v10
+; VI-NEXT:    v_readfirstlane_b32 s10, v11
+; VI-NEXT:    v_readfirstlane_b32 s11, v12
 ; VI-NEXT:    v_readfirstlane_b32 s8, v13
 ; VI-NEXT:    v_readfirstlane_b32 s9, v14
 ; VI-NEXT:    v_readfirstlane_b32 s6, v15
@@ -105964,385 +107224,431 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3
 ; SI-LABEL: bitcast_v16i64_to_v64i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_writelane_b32 v20, s30, 0
+; SI-NEXT:    v_writelane_b32 v20, s31, 1
+; SI-NEXT:    v_writelane_b32 v20, s34, 2
+; SI-NEXT:    v_writelane_b32 v20, s35, 3
+; SI-NEXT:    v_writelane_b32 v20, s36, 4
+; SI-NEXT:    v_writelane_b32 v20, s37, 5
+; SI-NEXT:    v_writelane_b32 v20, s38, 6
+; SI-NEXT:    v_writelane_b32 v20, s39, 7
+; SI-NEXT:    v_writelane_b32 v20, s48, 8
+; SI-NEXT:    v_writelane_b32 v20, s49, 9
+; SI-NEXT:    v_writelane_b32 v20, s50, 10
+; SI-NEXT:    v_writelane_b32 v20, s51, 11
+; SI-NEXT:    v_writelane_b32 v20, s52, 12
+; SI-NEXT:    v_writelane_b32 v20, s53, 13
+; SI-NEXT:    v_writelane_b32 v20, s54, 14
+; SI-NEXT:    v_writelane_b32 v20, s55, 15
+; SI-NEXT:    v_writelane_b32 v20, s64, 16
+; SI-NEXT:    v_writelane_b32 v20, s65, 17
+; SI-NEXT:    v_writelane_b32 v20, s66, 18
+; SI-NEXT:    v_writelane_b32 v20, s67, 19
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v19
-; SI-NEXT:    v_readfirstlane_b32 s47, v1
-; SI-NEXT:    v_readfirstlane_b32 s46, v2
-; SI-NEXT:    v_readfirstlane_b32 s45, v3
-; SI-NEXT:    v_readfirstlane_b32 s44, v4
-; SI-NEXT:    v_readfirstlane_b32 s43, v5
-; SI-NEXT:    v_readfirstlane_b32 s42, v6
-; SI-NEXT:    v_readfirstlane_b32 s41, v7
-; SI-NEXT:    v_readfirstlane_b32 s40, v8
-; SI-NEXT:    v_readfirstlane_b32 s15, v9
-; SI-NEXT:    v_readfirstlane_b32 s14, v10
-; SI-NEXT:    v_readfirstlane_b32 s13, v11
-; SI-NEXT:    v_readfirstlane_b32 s12, v12
-; SI-NEXT:    v_readfirstlane_b32 s11, v13
-; SI-NEXT:    v_readfirstlane_b32 s10, v14
-; SI-NEXT:    v_readfirstlane_b32 s9, v15
-; SI-NEXT:    v_readfirstlane_b32 s8, v16
-; SI-NEXT:    v_readfirstlane_b32 s7, v17
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_readfirstlane_b32 s6, v18
+; SI-NEXT:    v_writelane_b32 v20, s68, 20
+; SI-NEXT:    v_readfirstlane_b32 s44, v1
+; SI-NEXT:    v_readfirstlane_b32 s45, v2
+; SI-NEXT:    v_readfirstlane_b32 s42, v3
+; SI-NEXT:    v_readfirstlane_b32 s43, v4
+; SI-NEXT:    v_readfirstlane_b32 s40, v5
+; SI-NEXT:    v_readfirstlane_b32 s41, v6
+; SI-NEXT:    v_readfirstlane_b32 s14, v7
+; SI-NEXT:    v_readfirstlane_b32 s15, v8
+; SI-NEXT:    v_readfirstlane_b32 s12, v9
+; SI-NEXT:    v_readfirstlane_b32 s13, v10
+; SI-NEXT:    v_readfirstlane_b32 s10, v11
+; SI-NEXT:    v_readfirstlane_b32 s11, v12
+; SI-NEXT:    v_readfirstlane_b32 s8, v13
+; SI-NEXT:    v_readfirstlane_b32 s9, v14
+; SI-NEXT:    v_readfirstlane_b32 s6, v15
+; SI-NEXT:    v_readfirstlane_b32 s7, v16
+; SI-NEXT:    v_readfirstlane_b32 s4, v17
+; SI-NEXT:    s_and_b64 s[46:47], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s5, v18
+; SI-NEXT:    v_writelane_b32 v20, s69, 21
 ; SI-NEXT:    s_cbranch_scc0 .LBB69_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s9
-; SI-NEXT:    v_mov_b32_e32 v3, s11
-; SI-NEXT:    v_mov_b32_e32 v4, s13
-; SI-NEXT:    v_mov_b32_e32 v5, s15
-; SI-NEXT:    v_mov_b32_e32 v6, s41
-; SI-NEXT:    v_mov_b32_e32 v7, s43
-; SI-NEXT:    v_mov_b32_e32 v8, s45
-; SI-NEXT:    v_mov_b32_e32 v9, s47
-; SI-NEXT:    v_mov_b32_e32 v10, s28
-; SI-NEXT:    v_mov_b32_e32 v11, s26
-; SI-NEXT:    v_mov_b32_e32 v12, s24
-; SI-NEXT:    v_mov_b32_e32 v13, s22
-; SI-NEXT:    v_mov_b32_e32 v14, s20
-; SI-NEXT:    v_mov_b32_e32 v15, s18
-; SI-NEXT:    v_mov_b32_e32 v16, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v2, s8, v2, 16
-; SI-NEXT:    v_alignbit_b32 v3, s10, v3, 16
-; SI-NEXT:    v_alignbit_b32 v4, s12, v4, 16
-; SI-NEXT:    v_alignbit_b32 v5, s14, v5, 16
-; SI-NEXT:    v_alignbit_b32 v6, s40, v6, 16
-; SI-NEXT:    v_alignbit_b32 v7, s42, v7, 16
-; SI-NEXT:    v_alignbit_b32 v8, s44, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, s46, v9, 16
-; SI-NEXT:    v_alignbit_b32 v10, s29, v10, 16
-; SI-NEXT:    v_alignbit_b32 v11, s27, v11, 16
-; SI-NEXT:    v_alignbit_b32 v12, s25, v12, 16
-; SI-NEXT:    v_alignbit_b32 v13, s23, v13, 16
-; SI-NEXT:    v_alignbit_b32 v14, s21, v14, 16
-; SI-NEXT:    v_alignbit_b32 v15, s19, v15, 16
-; SI-NEXT:    v_alignbit_b32 v16, s17, v16, 16
-; SI-NEXT:    s_lshr_b32 s56, s6, 16
-; SI-NEXT:    s_lshr_b32 s57, s8, 16
-; SI-NEXT:    s_lshr_b32 s58, s10, 16
-; SI-NEXT:    s_lshr_b32 s59, s12, 16
-; SI-NEXT:    s_lshr_b32 s60, s14, 16
-; SI-NEXT:    s_lshr_b32 s61, s40, 16
-; SI-NEXT:    s_lshr_b32 s62, s42, 16
-; SI-NEXT:    s_lshr_b32 s63, s44, 16
-; SI-NEXT:    s_lshr_b32 s72, s46, 16
-; SI-NEXT:    s_lshr_b32 s73, s29, 16
-; SI-NEXT:    s_lshr_b32 s74, s27, 16
-; SI-NEXT:    s_lshr_b32 s75, s25, 16
-; SI-NEXT:    s_lshr_b32 s76, s23, 16
-; SI-NEXT:    s_lshr_b32 s77, s21, 16
-; SI-NEXT:    s_lshr_b32 s78, s19, 16
-; SI-NEXT:    s_lshr_b32 s79, s17, 16
+; SI-NEXT:    s_lshr_b32 s38, s5, 16
+; SI-NEXT:    s_lshr_b32 s39, s7, 16
+; SI-NEXT:    s_lshr_b32 s48, s9, 16
+; SI-NEXT:    s_lshr_b32 s49, s11, 16
+; SI-NEXT:    s_lshr_b32 s50, s13, 16
+; SI-NEXT:    s_lshr_b32 s51, s15, 16
+; SI-NEXT:    s_lshr_b32 s52, s41, 16
+; SI-NEXT:    s_lshr_b32 s53, s43, 16
+; SI-NEXT:    s_lshr_b32 s54, s45, 16
+; SI-NEXT:    s_lshr_b32 s55, s29, 16
+; SI-NEXT:    s_lshr_b32 s64, s27, 16
+; SI-NEXT:    s_lshr_b32 s65, s25, 16
+; SI-NEXT:    s_lshr_b32 s66, s23, 16
+; SI-NEXT:    s_lshr_b32 s67, s21, 16
+; SI-NEXT:    s_lshr_b32 s68, s19, 16
+; SI-NEXT:    s_lshr_b32 s69, s17, 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[58:59], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[10:11], 16
+; SI-NEXT:    s_lshr_b64 s[62:63], s[12:13], 16
+; SI-NEXT:    s_lshr_b64 s[72:73], s[14:15], 16
+; SI-NEXT:    s_lshr_b64 s[74:75], s[40:41], 16
+; SI-NEXT:    s_lshr_b64 s[76:77], s[42:43], 16
+; SI-NEXT:    s_lshr_b64 s[78:79], s[44:45], 16
+; SI-NEXT:    s_lshr_b64 s[88:89], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[90:91], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[92:93], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[94:95], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[30:31], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[34:35], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[36:37], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB69_3
 ; SI-NEXT:  .LBB69_2: ; %cmp.true
-; SI-NEXT:    s_add_u32 s16, s16, 3
-; SI-NEXT:    s_addc_u32 s17, s17, 0
-; SI-NEXT:    s_add_u32 s18, s18, 3
-; SI-NEXT:    s_addc_u32 s19, s19, 0
-; SI-NEXT:    s_add_u32 s20, s20, 3
-; SI-NEXT:    s_addc_u32 s21, s21, 0
-; SI-NEXT:    s_add_u32 s22, s22, 3
-; SI-NEXT:    s_addc_u32 s23, s23, 0
-; SI-NEXT:    s_add_u32 s24, s24, 3
-; SI-NEXT:    s_addc_u32 s25, s25, 0
-; SI-NEXT:    s_add_u32 s26, s26, 3
-; SI-NEXT:    s_addc_u32 s27, s27, 0
+; SI-NEXT:    s_add_u32 s4, s4, 3
+; SI-NEXT:    s_addc_u32 s5, s5, 0
+; SI-NEXT:    s_add_u32 s6, s6, 3
+; SI-NEXT:    s_addc_u32 s7, s7, 0
+; SI-NEXT:    s_add_u32 s8, s8, 3
+; SI-NEXT:    s_addc_u32 s9, s9, 0
+; SI-NEXT:    s_add_u32 s10, s10, 3
+; SI-NEXT:    s_addc_u32 s11, s11, 0
+; SI-NEXT:    s_add_u32 s12, s12, 3
+; SI-NEXT:    s_addc_u32 s13, s13, 0
+; SI-NEXT:    s_add_u32 s14, s14, 3
+; SI-NEXT:    s_addc_u32 s15, s15, 0
+; SI-NEXT:    s_add_u32 s40, s40, 3
+; SI-NEXT:    s_addc_u32 s41, s41, 0
+; SI-NEXT:    s_add_u32 s42, s42, 3
+; SI-NEXT:    s_addc_u32 s43, s43, 0
+; SI-NEXT:    s_add_u32 s44, s44, 3
+; SI-NEXT:    s_addc_u32 s45, s45, 0
 ; SI-NEXT:    s_add_u32 s28, s28, 3
 ; SI-NEXT:    s_addc_u32 s29, s29, 0
-; SI-NEXT:    s_add_u32 s47, s47, 3
-; SI-NEXT:    s_addc_u32 s46, s46, 0
-; SI-NEXT:    s_add_u32 s45, s45, 3
-; SI-NEXT:    s_addc_u32 s44, s44, 0
-; SI-NEXT:    s_add_u32 s43, s43, 3
-; SI-NEXT:    s_addc_u32 s42, s42, 0
-; SI-NEXT:    s_add_u32 s41, s41, 3
-; SI-NEXT:    s_addc_u32 s40, s40, 0
-; SI-NEXT:    s_add_u32 s15, s15, 3
-; SI-NEXT:    s_addc_u32 s14, s14, 0
-; SI-NEXT:    s_add_u32 s13, s13, 3
-; SI-NEXT:    s_addc_u32 s12, s12, 0
-; SI-NEXT:    s_add_u32 s11, s11, 3
-; SI-NEXT:    s_addc_u32 s10, s10, 0
-; SI-NEXT:    s_add_u32 s9, s9, 3
-; SI-NEXT:    s_addc_u32 s8, s8, 0
-; SI-NEXT:    s_add_u32 s7, s7, 3
-; SI-NEXT:    s_addc_u32 s6, s6, 0
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s9
-; SI-NEXT:    v_mov_b32_e32 v3, s11
-; SI-NEXT:    v_mov_b32_e32 v4, s13
-; SI-NEXT:    v_mov_b32_e32 v5, s15
-; SI-NEXT:    v_mov_b32_e32 v6, s41
-; SI-NEXT:    v_mov_b32_e32 v7, s43
-; SI-NEXT:    v_mov_b32_e32 v8, s45
-; SI-NEXT:    v_mov_b32_e32 v9, s47
-; SI-NEXT:    v_mov_b32_e32 v10, s28
-; SI-NEXT:    v_mov_b32_e32 v11, s26
-; SI-NEXT:    v_mov_b32_e32 v12, s24
-; SI-NEXT:    v_mov_b32_e32 v13, s22
-; SI-NEXT:    v_mov_b32_e32 v14, s20
-; SI-NEXT:    v_mov_b32_e32 v15, s18
-; SI-NEXT:    v_mov_b32_e32 v16, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v2, s8, v2, 16
-; SI-NEXT:    v_alignbit_b32 v3, s10, v3, 16
-; SI-NEXT:    v_alignbit_b32 v4, s12, v4, 16
-; SI-NEXT:    v_alignbit_b32 v5, s14, v5, 16
-; SI-NEXT:    v_alignbit_b32 v6, s40, v6, 16
-; SI-NEXT:    v_alignbit_b32 v7, s42, v7, 16
-; SI-NEXT:    v_alignbit_b32 v8, s44, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, s46, v9, 16
-; SI-NEXT:    v_alignbit_b32 v10, s29, v10, 16
-; SI-NEXT:    v_alignbit_b32 v11, s27, v11, 16
-; SI-NEXT:    v_alignbit_b32 v12, s25, v12, 16
-; SI-NEXT:    v_alignbit_b32 v13, s23, v13, 16
-; SI-NEXT:    v_alignbit_b32 v14, s21, v14, 16
-; SI-NEXT:    v_alignbit_b32 v15, s19, v15, 16
-; SI-NEXT:    v_alignbit_b32 v16, s17, v16, 16
-; SI-NEXT:    s_lshr_b32 s56, s6, 16
-; SI-NEXT:    s_lshr_b32 s57, s8, 16
-; SI-NEXT:    s_lshr_b32 s58, s10, 16
-; SI-NEXT:    s_lshr_b32 s59, s12, 16
-; SI-NEXT:    s_lshr_b32 s60, s14, 16
-; SI-NEXT:    s_lshr_b32 s61, s40, 16
-; SI-NEXT:    s_lshr_b32 s62, s42, 16
-; SI-NEXT:    s_lshr_b32 s63, s44, 16
-; SI-NEXT:    s_lshr_b32 s72, s46, 16
-; SI-NEXT:    s_lshr_b32 s73, s29, 16
-; SI-NEXT:    s_lshr_b32 s74, s27, 16
-; SI-NEXT:    s_lshr_b32 s75, s25, 16
-; SI-NEXT:    s_lshr_b32 s76, s23, 16
-; SI-NEXT:    s_lshr_b32 s77, s21, 16
-; SI-NEXT:    s_lshr_b32 s78, s19, 16
-; SI-NEXT:    s_lshr_b32 s79, s17, 16
+; SI-NEXT:    s_add_u32 s26, s26, 3
+; SI-NEXT:    s_addc_u32 s27, s27, 0
+; SI-NEXT:    s_add_u32 s24, s24, 3
+; SI-NEXT:    s_addc_u32 s25, s25, 0
+; SI-NEXT:    s_add_u32 s22, s22, 3
+; SI-NEXT:    s_addc_u32 s23, s23, 0
+; SI-NEXT:    s_add_u32 s20, s20, 3
+; SI-NEXT:    s_addc_u32 s21, s21, 0
+; SI-NEXT:    s_add_u32 s18, s18, 3
+; SI-NEXT:    s_addc_u32 s19, s19, 0
+; SI-NEXT:    s_add_u32 s16, s16, 3
+; SI-NEXT:    s_addc_u32 s17, s17, 0
+; SI-NEXT:    s_lshr_b32 s38, s5, 16
+; SI-NEXT:    s_lshr_b32 s39, s7, 16
+; SI-NEXT:    s_lshr_b32 s48, s9, 16
+; SI-NEXT:    s_lshr_b32 s49, s11, 16
+; SI-NEXT:    s_lshr_b32 s50, s13, 16
+; SI-NEXT:    s_lshr_b32 s51, s15, 16
+; SI-NEXT:    s_lshr_b32 s52, s41, 16
+; SI-NEXT:    s_lshr_b32 s53, s43, 16
+; SI-NEXT:    s_lshr_b32 s54, s45, 16
+; SI-NEXT:    s_lshr_b32 s55, s29, 16
+; SI-NEXT:    s_lshr_b32 s64, s27, 16
+; SI-NEXT:    s_lshr_b32 s65, s25, 16
+; SI-NEXT:    s_lshr_b32 s66, s23, 16
+; SI-NEXT:    s_lshr_b32 s67, s21, 16
+; SI-NEXT:    s_lshr_b32 s68, s19, 16
+; SI-NEXT:    s_lshr_b32 s69, s17, 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[58:59], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[10:11], 16
+; SI-NEXT:    s_lshr_b64 s[62:63], s[12:13], 16
+; SI-NEXT:    s_lshr_b64 s[72:73], s[14:15], 16
+; SI-NEXT:    s_lshr_b64 s[74:75], s[40:41], 16
+; SI-NEXT:    s_lshr_b64 s[76:77], s[42:43], 16
+; SI-NEXT:    s_lshr_b64 s[78:79], s[44:45], 16
+; SI-NEXT:    s_lshr_b64 s[88:89], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[90:91], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[92:93], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[94:95], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[30:31], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[34:35], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[36:37], s[16:17], 16
 ; SI-NEXT:  .LBB69_3: ; %end
-; SI-NEXT:    s_and_b32 s4, s16, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; SI-NEXT:    v_or_b32_e32 v16, s4, v16
-; SI-NEXT:    s_and_b32 s4, s17, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s79, 16
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_mov_b32_e32 v17, s4
-; SI-NEXT:    s_and_b32 s4, s18, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; SI-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen
+; SI-NEXT:    s_lshl_b32 s47, s36, 16
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s47
+; SI-NEXT:    v_mov_b32_e32 v1, s16
+; SI-NEXT:    s_and_b32 s16, s17, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s69, 16
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_lshl_b32 s16, s34, 16
+; SI-NEXT:    s_and_b32 s17, s18, 0xffff
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    v_mov_b32_e32 v3, s16
+; SI-NEXT:    s_and_b32 s16, s19, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s68, 16
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v4, s16
+; SI-NEXT:    s_lshl_b32 s16, s30, 16
+; SI-NEXT:    s_and_b32 s17, s20, 0xffff
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v16, vcc, 4, v0
-; SI-NEXT:    v_or_b32_e32 v15, s4, v15
-; SI-NEXT:    s_and_b32 s4, s19, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s78, 16
-; SI-NEXT:    buffer_store_dword v17, v16, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v16, vcc, 8, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v15, v16, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v16, s4
-; SI-NEXT:    s_and_b32 s4, s20, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
+; SI-NEXT:    v_mov_b32_e32 v5, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
+; SI-NEXT:    s_and_b32 s16, s21, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s67, 16
+; SI-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 12, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 16, v0
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s22, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s94, 16
+; SI-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 20, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v15, vcc, 12, v0
-; SI-NEXT:    v_or_b32_e32 v14, s4, v14
-; SI-NEXT:    s_and_b32 s4, s21, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s77, 16
-; SI-NEXT:    buffer_store_dword v16, v15, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v15, vcc, 16, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v14, v15, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v15, s4
-; SI-NEXT:    s_and_b32 s4, s22, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s23, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s66, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 24, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 20, v0
-; SI-NEXT:    v_or_b32_e32 v13, s4, v13
-; SI-NEXT:    s_and_b32 s4, s23, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s76, 16
-; SI-NEXT:    buffer_store_dword v15, v14, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 24, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v14, s4
-; SI-NEXT:    s_and_b32 s4, s24, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s24, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s92, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 28, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v13, vcc, 28, v0
-; SI-NEXT:    v_or_b32_e32 v12, s4, v12
-; SI-NEXT:    s_and_b32 s4, s25, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s75, 16
-; SI-NEXT:    buffer_store_dword v14, v13, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v13, vcc, 32, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v12, v13, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v13, s4
-; SI-NEXT:    s_and_b32 s4, s26, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s25, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s65, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 32, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v12, vcc, 36, v0
-; SI-NEXT:    v_or_b32_e32 v11, s4, v11
-; SI-NEXT:    s_and_b32 s4, s27, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s74, 16
-; SI-NEXT:    buffer_store_dword v13, v12, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v12, vcc, 40, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v11, v12, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v12, s4
-; SI-NEXT:    s_and_b32 s4, s28, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s26, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s90, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 36, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 44, v0
-; SI-NEXT:    v_or_b32_e32 v10, s4, v10
-; SI-NEXT:    s_and_b32 s4, s29, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s73, 16
-; SI-NEXT:    buffer_store_dword v12, v11, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v10, v11, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v11, s4
-; SI-NEXT:    s_and_b32 s4, s47, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s27, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s64, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 40, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 52, v0
-; SI-NEXT:    v_or_b32_e32 v9, s4, v9
-; SI-NEXT:    s_and_b32 s4, s46, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s72, 16
-; SI-NEXT:    buffer_store_dword v11, v10, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 56, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v9, v10, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v10, s4
-; SI-NEXT:    s_and_b32 s4, s45, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s28, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s88, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 44, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 60, v0
-; SI-NEXT:    v_or_b32_e32 v8, s4, v8
-; SI-NEXT:    s_and_b32 s4, s44, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s63, 16
-; SI-NEXT:    buffer_store_dword v10, v9, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 64, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v8, v9, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v9, s4
-; SI-NEXT:    s_and_b32 s4, s43, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s29, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s55, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 48, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x44, v0
-; SI-NEXT:    v_or_b32_e32 v7, s4, v7
-; SI-NEXT:    s_and_b32 s4, s42, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s62, 16
-; SI-NEXT:    buffer_store_dword v9, v8, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v8, s4
-; SI-NEXT:    s_and_b32 s4, s41, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s44, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s78, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 52, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x4c, v0
-; SI-NEXT:    v_or_b32_e32 v6, s4, v6
-; SI-NEXT:    s_and_b32 s4, s40, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s61, 16
-; SI-NEXT:    buffer_store_dword v8, v7, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x50, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v7, s4
-; SI-NEXT:    s_and_b32 s4, s15, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s45, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s54, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 56, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x54, v0
-; SI-NEXT:    v_or_b32_e32 v5, s4, v5
-; SI-NEXT:    s_and_b32 s4, s14, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s60, 16
-; SI-NEXT:    buffer_store_dword v7, v6, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x58, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v6, s4
-; SI-NEXT:    s_and_b32 s4, s13, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s42, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s76, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 60, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x5c, v0
-; SI-NEXT:    v_or_b32_e32 v4, s4, v4
-; SI-NEXT:    s_and_b32 s4, s12, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s59, 16
-; SI-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x60, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v5, s4
-; SI-NEXT:    s_and_b32 s4, s11, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s43, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s53, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 64, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x64, v0
-; SI-NEXT:    v_or_b32_e32 v3, s4, v3
-; SI-NEXT:    s_and_b32 s4, s10, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s58, 16
-; SI-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x68, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v4, s4
-; SI-NEXT:    s_and_b32 s4, s9, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s40, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s74, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x44, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x6c, v0
-; SI-NEXT:    v_or_b32_e32 v2, s4, v2
-; SI-NEXT:    s_and_b32 s4, s8, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s57, 16
-; SI-NEXT:    buffer_store_dword v4, v3, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x70, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v3, s4
-; SI-NEXT:    s_and_b32 s4, s7, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s41, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s52, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x48, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x74, v0
-; SI-NEXT:    v_or_b32_e32 v1, s4, v1
-; SI-NEXT:    s_and_b32 s4, s6, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s56, 16
-; SI-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x78, v0
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s14, s14, 0xffff
+; SI-NEXT:    s_lshl_b32 s16, s72, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x4c, v0
+; SI-NEXT:    s_or_b32 s14, s14, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s14
+; SI-NEXT:    s_and_b32 s14, s15, 0xffff
+; SI-NEXT:    s_lshl_b32 s15, s51, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x50, v0
+; SI-NEXT:    s_or_b32 s14, s14, s15
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s14
+; SI-NEXT:    s_and_b32 s12, s12, 0xffff
+; SI-NEXT:    s_lshl_b32 s14, s62, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x54, v0
+; SI-NEXT:    s_or_b32 s12, s12, s14
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s12
+; SI-NEXT:    s_and_b32 s12, s13, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s50, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x58, v0
+; SI-NEXT:    s_or_b32 s12, s12, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s12
+; SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; SI-NEXT:    s_lshl_b32 s12, s60, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x5c, v0
+; SI-NEXT:    s_or_b32 s10, s10, s12
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s10, s11, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s49, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x60, v0
+; SI-NEXT:    s_or_b32 s10, s10, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; SI-NEXT:    s_lshl_b32 s10, s58, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x64, v0
+; SI-NEXT:    s_or_b32 s8, s8, s10
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s8, s9, 0xffff
+; SI-NEXT:    s_lshl_b32 s9, s48, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x68, v0
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_lshl_b32 s8, s56, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x6c, v0
+; SI-NEXT:    s_or_b32 s6, s6, s8
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s6, s7, 0xffff
+; SI-NEXT:    s_lshl_b32 s7, s39, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x70, v0
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; SI-NEXT:    s_lshl_b32 s6, s46, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x74, v0
+; SI-NEXT:    s_or_b32 s4, s4, s6
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s4
+; SI-NEXT:    s_and_b32 s4, s5, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s38, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x78, v0
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT:    v_readlane_b32 s69, v20, 21
+; SI-NEXT:    v_readlane_b32 s68, v20, 20
+; SI-NEXT:    v_readlane_b32 s67, v20, 19
+; SI-NEXT:    v_readlane_b32 s66, v20, 18
+; SI-NEXT:    v_readlane_b32 s65, v20, 17
+; SI-NEXT:    v_readlane_b32 s64, v20, 16
+; SI-NEXT:    v_readlane_b32 s55, v20, 15
+; SI-NEXT:    v_readlane_b32 s54, v20, 14
+; SI-NEXT:    v_readlane_b32 s53, v20, 13
+; SI-NEXT:    v_readlane_b32 s52, v20, 12
+; SI-NEXT:    v_readlane_b32 s51, v20, 11
+; SI-NEXT:    v_readlane_b32 s50, v20, 10
+; SI-NEXT:    v_readlane_b32 s49, v20, 9
+; SI-NEXT:    v_readlane_b32 s48, v20, 8
+; SI-NEXT:    v_readlane_b32 s39, v20, 7
+; SI-NEXT:    v_readlane_b32 s38, v20, 6
+; SI-NEXT:    v_readlane_b32 s37, v20, 5
+; SI-NEXT:    v_readlane_b32 s36, v20, 4
+; SI-NEXT:    v_readlane_b32 s35, v20, 3
+; SI-NEXT:    v_readlane_b32 s34, v20, 2
+; SI-NEXT:    v_readlane_b32 s31, v20, 1
+; SI-NEXT:    v_readlane_b32 s30, v20, 0
+; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB69_4:
-; SI-NEXT:    ; implicit-def: $vgpr16
-; SI-NEXT:    ; implicit-def: $sgpr79
-; SI-NEXT:    ; implicit-def: $vgpr15
+; SI-NEXT:    ; implicit-def: $sgpr36
+; SI-NEXT:    ; implicit-def: $sgpr69
+; SI-NEXT:    ; implicit-def: $sgpr34
+; SI-NEXT:    ; implicit-def: $sgpr68
+; SI-NEXT:    ; implicit-def: $sgpr30
+; SI-NEXT:    ; implicit-def: $sgpr67
+; SI-NEXT:    ; implicit-def: $sgpr94
+; SI-NEXT:    ; implicit-def: $sgpr66
+; SI-NEXT:    ; implicit-def: $sgpr92
+; SI-NEXT:    ; implicit-def: $sgpr65
+; SI-NEXT:    ; implicit-def: $sgpr90
+; SI-NEXT:    ; implicit-def: $sgpr64
+; SI-NEXT:    ; implicit-def: $sgpr88
+; SI-NEXT:    ; implicit-def: $sgpr55
 ; SI-NEXT:    ; implicit-def: $sgpr78
-; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; implicit-def: $sgpr77
-; SI-NEXT:    ; implicit-def: $vgpr13
+; SI-NEXT:    ; implicit-def: $sgpr54
 ; SI-NEXT:    ; implicit-def: $sgpr76
-; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; implicit-def: $sgpr75
-; SI-NEXT:    ; implicit-def: $vgpr11
+; SI-NEXT:    ; implicit-def: $sgpr53
 ; SI-NEXT:    ; implicit-def: $sgpr74
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $sgpr73
-; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $sgpr52
 ; SI-NEXT:    ; implicit-def: $sgpr72
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $sgpr63
-; SI-NEXT:    ; implicit-def: $vgpr7
+; SI-NEXT:    ; implicit-def: $sgpr51
 ; SI-NEXT:    ; implicit-def: $sgpr62
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $sgpr61
-; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $sgpr50
 ; SI-NEXT:    ; implicit-def: $sgpr60
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $sgpr59
-; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $sgpr49
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr39
+; SI-NEXT:    ; implicit-def: $sgpr38
 ; SI-NEXT:    ; implicit-def: $sgpr58
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $sgpr57
-; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; implicit-def: $sgpr56
+; SI-NEXT:    ; implicit-def: $sgpr46
 ; SI-NEXT:    s_branch .LBB69_2
 ;
 ; VI-LABEL: bitcast_v16i64_to_v64i16_scalar:
@@ -112766,656 +114072,1312 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
 ; SI-LABEL: bitcast_v16f64_to_v128i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    v_writelane_b32 v63, s30, 0
+; SI-NEXT:    v_writelane_b32 v63, s31, 1
+; SI-NEXT:    v_writelane_b32 v63, s34, 2
+; SI-NEXT:    v_writelane_b32 v63, s35, 3
+; SI-NEXT:    v_writelane_b32 v63, s36, 4
+; SI-NEXT:    v_writelane_b32 v63, s37, 5
+; SI-NEXT:    v_writelane_b32 v63, s38, 6
+; SI-NEXT:    v_writelane_b32 v63, s39, 7
+; SI-NEXT:    v_writelane_b32 v63, s48, 8
+; SI-NEXT:    v_writelane_b32 v63, s49, 9
+; SI-NEXT:    v_writelane_b32 v63, s50, 10
+; SI-NEXT:    v_writelane_b32 v63, s51, 11
+; SI-NEXT:    v_writelane_b32 v63, s52, 12
+; SI-NEXT:    v_writelane_b32 v63, s53, 13
+; SI-NEXT:    v_writelane_b32 v63, s54, 14
+; SI-NEXT:    v_writelane_b32 v63, s55, 15
+; SI-NEXT:    v_writelane_b32 v63, s64, 16
+; SI-NEXT:    v_writelane_b32 v63, s65, 17
+; SI-NEXT:    v_writelane_b32 v63, s66, 18
+; SI-NEXT:    v_writelane_b32 v63, s67, 19
+; SI-NEXT:    v_writelane_b32 v63, s68, 20
+; SI-NEXT:    v_writelane_b32 v63, s69, 21
+; SI-NEXT:    v_writelane_b32 v63, s70, 22
+; SI-NEXT:    v_writelane_b32 v63, s71, 23
+; SI-NEXT:    v_writelane_b32 v63, s80, 24
+; SI-NEXT:    v_writelane_b32 v63, s81, 25
+; SI-NEXT:    v_writelane_b32 v63, s82, 26
+; SI-NEXT:    v_writelane_b32 v63, s83, 27
+; SI-NEXT:    v_writelane_b32 v63, s84, 28
+; SI-NEXT:    v_writelane_b32 v63, s85, 29
+; SI-NEXT:    v_writelane_b32 v63, s86, 30
+; SI-NEXT:    v_writelane_b32 v63, s87, 31
+; SI-NEXT:    v_writelane_b32 v63, s96, 32
+; SI-NEXT:    v_writelane_b32 v63, s97, 33
+; SI-NEXT:    v_writelane_b32 v63, s98, 34
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v19
-; SI-NEXT:    v_mov_b32_e32 v31, s16
-; SI-NEXT:    v_mov_b32_e32 v32, s17
-; SI-NEXT:    v_mov_b32_e32 v29, s18
-; SI-NEXT:    v_mov_b32_e32 v30, s19
-; SI-NEXT:    v_mov_b32_e32 v27, s20
-; SI-NEXT:    v_mov_b32_e32 v28, s21
-; SI-NEXT:    v_mov_b32_e32 v25, s22
-; SI-NEXT:    v_mov_b32_e32 v26, s23
-; SI-NEXT:    v_mov_b32_e32 v23, s24
-; SI-NEXT:    v_mov_b32_e32 v24, s25
-; SI-NEXT:    v_mov_b32_e32 v21, s26
-; SI-NEXT:    v_mov_b32_e32 v22, s27
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_mov_b32_e32 v19, s28
-; SI-NEXT:    v_mov_b32_e32 v20, s29
-; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    s_cbranch_scc0 .LBB73_4
+; SI-NEXT:    v_writelane_b32 v63, s99, 35
+; SI-NEXT:    v_readfirstlane_b32 s4, v1
+; SI-NEXT:    v_readfirstlane_b32 s5, v2
+; SI-NEXT:    v_readfirstlane_b32 s6, v3
+; SI-NEXT:    v_readfirstlane_b32 s7, v4
+; SI-NEXT:    v_readfirstlane_b32 s8, v5
+; SI-NEXT:    v_readfirstlane_b32 s9, v6
+; SI-NEXT:    v_readfirstlane_b32 s10, v7
+; SI-NEXT:    v_readfirstlane_b32 s11, v8
+; SI-NEXT:    v_readfirstlane_b32 s12, v9
+; SI-NEXT:    v_readfirstlane_b32 s13, v10
+; SI-NEXT:    v_readfirstlane_b32 s14, v11
+; SI-NEXT:    v_readfirstlane_b32 s15, v12
+; SI-NEXT:    v_readfirstlane_b32 s40, v13
+; SI-NEXT:    v_readfirstlane_b32 s41, v14
+; SI-NEXT:    v_readfirstlane_b32 s42, v15
+; SI-NEXT:    v_readfirstlane_b32 s43, v16
+; SI-NEXT:    v_readfirstlane_b32 s44, v17
+; SI-NEXT:    s_and_b64 s[46:47], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s45, v18
+; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    ; implicit-def: $vgpr61 : SGPR spill to VGPR lane
+; SI-NEXT:    ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
+; SI-NEXT:    s_cbranch_scc0 .LBB73_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_alignbit_b32 v33, v18, v17, 24
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v18, v17, 16
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v18, v17, 8
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v16, v15, 24
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v16, v15, 16
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v16, v15, 8
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v14, v13, 24
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v14, v13, 16
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v14, v13, 8
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v12, v11, 24
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v12, v11, 16
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v12, v11, 8
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v10, v9, 24
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v10, v9, 16
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v10, v9, 8
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v8, v7, 24
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v8, v7, 16
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v8, v7, 8
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v6, v5, 24
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v6, v5, 16
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v6, v5, 8
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v4, v3, 24
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v4, v3, 16
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v4, v3, 8
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v2, v1, 24
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v2, v1, 16
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v2, v1, 8
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v20, v19, 24
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v20, v19, 16
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v20, v19, 8
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v22, v21, 24
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v22, v21, 16
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v22, v21, 8
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v24, v23, 24
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v24, v23, 16
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v24, v23, 8
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT:    s_lshr_b32 s46, s45, 24
+; SI-NEXT:    v_writelane_b32 v61, s46, 34
+; SI-NEXT:    s_lshr_b32 s46, s45, 16
+; SI-NEXT:    v_writelane_b32 v61, s46, 35
+; SI-NEXT:    s_lshr_b32 s46, s45, 8
+; SI-NEXT:    v_writelane_b32 v61, s46, 36
+; SI-NEXT:    s_lshr_b32 s46, s43, 24
+; SI-NEXT:    v_writelane_b32 v61, s46, 37
+; SI-NEXT:    s_lshr_b32 s46, s43, 16
+; SI-NEXT:    v_writelane_b32 v61, s46, 38
+; SI-NEXT:    s_lshr_b32 s46, s43, 8
+; SI-NEXT:    v_writelane_b32 v61, s46, 39
+; SI-NEXT:    s_lshr_b32 s46, s41, 24
+; SI-NEXT:    v_writelane_b32 v61, s46, 40
+; SI-NEXT:    s_lshr_b32 s46, s41, 16
+; SI-NEXT:    v_writelane_b32 v61, s46, 41
+; SI-NEXT:    s_lshr_b32 s46, s41, 8
+; SI-NEXT:    v_writelane_b32 v61, s46, 42
+; SI-NEXT:    s_lshr_b32 s46, s15, 24
+; SI-NEXT:    v_writelane_b32 v61, s46, 43
+; SI-NEXT:    s_lshr_b32 s46, s15, 16
+; SI-NEXT:    v_writelane_b32 v61, s46, 44
+; SI-NEXT:    s_lshr_b32 s46, s15, 8
+; SI-NEXT:    v_writelane_b32 v61, s46, 45
+; SI-NEXT:    s_lshr_b32 s46, s13, 24
+; SI-NEXT:    v_writelane_b32 v61, s46, 46
+; SI-NEXT:    s_lshr_b32 s46, s13, 16
+; SI-NEXT:    v_writelane_b32 v61, s46, 47
+; SI-NEXT:    s_lshr_b32 s46, s13, 8
+; SI-NEXT:    v_writelane_b32 v61, s46, 48
+; SI-NEXT:    s_lshr_b32 s46, s11, 24
+; SI-NEXT:    v_writelane_b32 v61, s46, 49
+; SI-NEXT:    s_lshr_b32 s46, s11, 16
+; SI-NEXT:    v_writelane_b32 v61, s46, 50
+; SI-NEXT:    s_lshr_b32 s46, s11, 8
+; SI-NEXT:    v_writelane_b32 v61, s46, 51
+; SI-NEXT:    s_lshr_b32 s46, s9, 24
+; SI-NEXT:    v_writelane_b32 v61, s46, 52
+; SI-NEXT:    s_lshr_b32 s46, s9, 16
+; SI-NEXT:    v_writelane_b32 v61, s46, 53
+; SI-NEXT:    s_lshr_b32 s46, s9, 8
+; SI-NEXT:    v_writelane_b32 v61, s46, 54
+; SI-NEXT:    s_lshr_b32 s46, s7, 24
+; SI-NEXT:    v_writelane_b32 v61, s46, 55
+; SI-NEXT:    s_lshr_b32 s46, s7, 16
+; SI-NEXT:    v_writelane_b32 v61, s46, 56
+; SI-NEXT:    s_lshr_b32 s46, s7, 8
+; SI-NEXT:    v_writelane_b32 v61, s46, 57
+; SI-NEXT:    s_lshr_b32 s46, s5, 24
+; SI-NEXT:    v_writelane_b32 v61, s46, 58
+; SI-NEXT:    s_lshr_b32 s46, s5, 16
+; SI-NEXT:    v_writelane_b32 v61, s46, 59
+; SI-NEXT:    s_lshr_b32 s46, s5, 8
+; SI-NEXT:    v_writelane_b32 v61, s46, 60
+; SI-NEXT:    s_lshr_b32 s46, s29, 24
+; SI-NEXT:    v_writelane_b32 v61, s46, 61
+; SI-NEXT:    s_lshr_b32 s46, s29, 16
+; SI-NEXT:    v_writelane_b32 v61, s46, 62
+; SI-NEXT:    s_lshr_b32 s46, s29, 8
+; SI-NEXT:    v_writelane_b32 v61, s46, 63
+; SI-NEXT:    s_lshr_b32 s46, s27, 24
+; SI-NEXT:    v_writelane_b32 v62, s46, 0
+; SI-NEXT:    s_lshr_b32 s46, s27, 16
+; SI-NEXT:    v_writelane_b32 v62, s46, 1
+; SI-NEXT:    s_lshr_b32 s46, s27, 8
+; SI-NEXT:    v_writelane_b32 v62, s46, 2
+; SI-NEXT:    s_lshr_b32 s46, s25, 24
+; SI-NEXT:    v_writelane_b32 v62, s46, 3
+; SI-NEXT:    s_lshr_b32 s46, s25, 16
+; SI-NEXT:    v_writelane_b32 v62, s46, 4
+; SI-NEXT:    s_lshr_b32 s46, s25, 8
+; SI-NEXT:    v_writelane_b32 v62, s46, 5
+; SI-NEXT:    s_lshr_b32 s46, s23, 24
+; SI-NEXT:    v_writelane_b32 v62, s46, 6
+; SI-NEXT:    s_lshr_b32 s46, s23, 16
+; SI-NEXT:    v_writelane_b32 v62, s46, 7
+; SI-NEXT:    s_lshr_b32 s46, s23, 8
+; SI-NEXT:    v_writelane_b32 v62, s46, 8
+; SI-NEXT:    s_lshr_b32 s46, s21, 24
+; SI-NEXT:    v_writelane_b32 v62, s46, 9
+; SI-NEXT:    s_lshr_b32 s46, s21, 16
+; SI-NEXT:    v_writelane_b32 v62, s46, 10
+; SI-NEXT:    s_lshr_b32 s46, s21, 8
+; SI-NEXT:    v_writelane_b32 v62, s46, 11
+; SI-NEXT:    s_lshr_b32 s46, s19, 24
+; SI-NEXT:    v_writelane_b32 v62, s46, 12
+; SI-NEXT:    s_lshr_b32 s46, s19, 16
+; SI-NEXT:    v_writelane_b32 v62, s46, 13
+; SI-NEXT:    s_lshr_b32 s46, s19, 8
+; SI-NEXT:    v_writelane_b32 v62, s46, 14
+; SI-NEXT:    s_lshr_b32 s46, s17, 24
+; SI-NEXT:    v_writelane_b32 v62, s46, 15
+; SI-NEXT:    s_lshr_b32 s46, s17, 16
+; SI-NEXT:    v_writelane_b32 v62, s46, 16
+; SI-NEXT:    s_lshr_b32 s46, s17, 8
+; SI-NEXT:    v_writelane_b32 v62, s46, 17
+; SI-NEXT:    s_lshr_b64 s[46:47], s[44:45], 24
+; SI-NEXT:    v_writelane_b32 v61, s46, 32
+; SI-NEXT:    v_writelane_b32 v61, s47, 33
+; SI-NEXT:    s_lshr_b64 s[46:47], s[44:45], 8
+; SI-NEXT:    v_writelane_b32 v61, s46, 30
+; SI-NEXT:    v_writelane_b32 v61, s47, 31
+; SI-NEXT:    s_lshr_b64 s[46:47], s[42:43], 24
+; SI-NEXT:    v_writelane_b32 v61, s46, 28
+; SI-NEXT:    v_writelane_b32 v61, s47, 29
+; SI-NEXT:    s_lshr_b64 s[46:47], s[42:43], 16
+; SI-NEXT:    v_writelane_b32 v61, s46, 26
+; SI-NEXT:    v_writelane_b32 v61, s47, 27
+; SI-NEXT:    s_lshr_b64 s[46:47], s[42:43], 8
+; SI-NEXT:    v_writelane_b32 v61, s46, 24
+; SI-NEXT:    v_writelane_b32 v61, s47, 25
+; SI-NEXT:    s_lshr_b64 s[46:47], s[40:41], 24
+; SI-NEXT:    v_writelane_b32 v61, s46, 22
+; SI-NEXT:    v_writelane_b32 v61, s47, 23
+; SI-NEXT:    s_lshr_b64 s[46:47], s[40:41], 16
+; SI-NEXT:    v_writelane_b32 v61, s46, 20
+; SI-NEXT:    v_writelane_b32 v61, s47, 21
+; SI-NEXT:    s_lshr_b64 s[46:47], s[40:41], 8
+; SI-NEXT:    v_writelane_b32 v61, s46, 18
+; SI-NEXT:    v_writelane_b32 v61, s47, 19
+; SI-NEXT:    s_lshr_b64 s[46:47], s[14:15], 24
+; SI-NEXT:    v_writelane_b32 v61, s46, 16
+; SI-NEXT:    v_writelane_b32 v61, s47, 17
+; SI-NEXT:    s_lshr_b64 s[46:47], s[14:15], 16
+; SI-NEXT:    v_writelane_b32 v61, s46, 14
+; SI-NEXT:    v_writelane_b32 v61, s47, 15
+; SI-NEXT:    s_lshr_b64 s[46:47], s[14:15], 8
+; SI-NEXT:    v_writelane_b32 v61, s46, 12
+; SI-NEXT:    v_writelane_b32 v61, s47, 13
+; SI-NEXT:    s_lshr_b64 s[46:47], s[12:13], 24
+; SI-NEXT:    v_writelane_b32 v61, s46, 10
+; SI-NEXT:    v_writelane_b32 v61, s47, 11
+; SI-NEXT:    s_lshr_b64 s[46:47], s[12:13], 16
+; SI-NEXT:    v_writelane_b32 v61, s46, 8
+; SI-NEXT:    v_writelane_b32 v61, s47, 9
+; SI-NEXT:    s_lshr_b64 s[46:47], s[12:13], 8
+; SI-NEXT:    v_writelane_b32 v61, s46, 6
+; SI-NEXT:    v_writelane_b32 v61, s47, 7
+; SI-NEXT:    s_lshr_b64 s[46:47], s[10:11], 24
+; SI-NEXT:    v_writelane_b32 v61, s46, 4
+; SI-NEXT:    v_writelane_b32 v61, s47, 5
+; SI-NEXT:    s_lshr_b64 s[46:47], s[10:11], 16
+; SI-NEXT:    v_writelane_b32 v61, s46, 2
+; SI-NEXT:    v_writelane_b32 v61, s47, 3
+; SI-NEXT:    s_lshr_b64 s[46:47], s[10:11], 8
+; SI-NEXT:    v_writelane_b32 v61, s46, 0
+; SI-NEXT:    s_lshr_b64 s[48:49], s[44:45], 16
+; SI-NEXT:    v_writelane_b32 v61, s47, 1
+; SI-NEXT:    s_lshr_b64 s[50:51], s[8:9], 24
+; SI-NEXT:    s_lshr_b64 s[52:53], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[54:55], s[8:9], 8
+; SI-NEXT:    s_lshr_b64 s[64:65], s[6:7], 24
+; SI-NEXT:    s_lshr_b64 s[66:67], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[68:69], s[6:7], 8
+; SI-NEXT:    s_lshr_b64 s[70:71], s[4:5], 24
+; SI-NEXT:    s_lshr_b64 s[80:81], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[82:83], s[4:5], 8
+; SI-NEXT:    s_lshr_b64 s[84:85], s[28:29], 24
+; SI-NEXT:    s_lshr_b64 s[86:87], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[96:97], s[28:29], 8
+; SI-NEXT:    s_lshr_b64 s[98:99], s[26:27], 24
+; SI-NEXT:    s_lshr_b64 s[46:47], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[26:27], 8
+; SI-NEXT:    s_lshr_b64 s[58:59], s[24:25], 24
+; SI-NEXT:    s_lshr_b64 s[60:61], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[62:63], s[24:25], 8
+; SI-NEXT:    s_lshr_b64 s[72:73], s[22:23], 24
+; SI-NEXT:    s_lshr_b64 s[74:75], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[76:77], s[22:23], 8
+; SI-NEXT:    s_lshr_b64 s[78:79], s[20:21], 24
+; SI-NEXT:    s_lshr_b64 s[88:89], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[90:91], s[20:21], 8
+; SI-NEXT:    s_lshr_b64 s[92:93], s[18:19], 24
+; SI-NEXT:    s_lshr_b64 s[94:95], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[30:31], s[18:19], 8
+; SI-NEXT:    s_lshr_b64 s[34:35], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[36:37], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[38:39], s[16:17], 8
+; SI-NEXT:    s_cbranch_execnz .LBB73_4
+; SI-NEXT:  .LBB73_2: ; %cmp.true
+; SI-NEXT:    v_add_f64 v[5:6], s[40:41], 1.0
+; SI-NEXT:    v_add_f64 v[7:8], s[14:15], 1.0
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v6
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v26, v25, 24
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 8, v6
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v26, v25, 16
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 24, v8
+; SI-NEXT:    v_add_f64 v[9:10], s[12:13], 1.0
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v26, v25, 8
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v8
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 24, v18
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 8, v8
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 8, v18
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 24, v10
+; SI-NEXT:    v_add_f64 v[11:12], s[10:11], 1.0
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 24, v16
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v10
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v16
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 8, v10
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 8, v16
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 24, v12
+; SI-NEXT:    v_add_f64 v[13:14], s[8:9], 1.0
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 24, v14
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v12
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v14
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 8, v12
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 8, v14
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 24, v14
+; SI-NEXT:    v_add_f64 v[15:16], s[6:7], 1.0
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 24, v12
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v14
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v12
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 8, v14
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 8, v12
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 24, v16
+; SI-NEXT:    v_add_f64 v[17:18], s[4:5], 1.0
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 24, v10
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v16
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v10
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 8, v16
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 8, v10
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 24, v18
+; SI-NEXT:    v_add_f64 v[19:20], s[28:29], 1.0
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 24, v8
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v18
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v8
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 8, v18
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 8, v8
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 24, v20
+; SI-NEXT:    v_add_f64 v[21:22], s[26:27], 1.0
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 24, v6
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v20
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v6
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 8, v20
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 8, v6
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 24, v22
+; SI-NEXT:    v_add_f64 v[23:24], s[24:25], 1.0
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 24, v4
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v22
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 8, v22
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 8, v4
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 24, v24
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 24, v2
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v24
+; SI-NEXT:    v_add_f64 v[38:39], s[22:23], 1.0
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v2
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 8, v24
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 8, v2
-; SI-NEXT:    v_alignbit_b32 v38, v28, v27, 24
-; SI-NEXT:    v_alignbit_b32 v48, v28, v27, 16
-; SI-NEXT:    v_alignbit_b32 v50, v28, v27, 8
-; SI-NEXT:    v_alignbit_b32 v52, v30, v29, 24
-; SI-NEXT:    v_alignbit_b32 v54, v30, v29, 16
-; SI-NEXT:    v_alignbit_b32 v40, v30, v29, 8
-; SI-NEXT:    v_alignbit_b32 v42, v32, v31, 24
-; SI-NEXT:    v_alignbit_b32 v44, v32, v31, 16
-; SI-NEXT:    v_alignbit_b32 v46, v32, v31, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v18
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshrrev_b32_e32 v37, 24, v20
-; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v20
-; SI-NEXT:    v_lshrrev_b32_e32 v49, 8, v20
-; SI-NEXT:    v_lshrrev_b32_e32 v51, 24, v22
-; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v22
-; SI-NEXT:    v_lshrrev_b32_e32 v55, 8, v22
-; SI-NEXT:    v_lshrrev_b32_e32 v41, 24, v24
-; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v24
-; SI-NEXT:    v_lshrrev_b32_e32 v45, 8, v24
-; SI-NEXT:    v_lshrrev_b32_e32 v47, 24, v26
-; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v26
-; SI-NEXT:    v_lshrrev_b32_e32 v57, 8, v26
-; SI-NEXT:    v_lshrrev_b32_e32 v58, 24, v28
-; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v28
-; SI-NEXT:    v_lshrrev_b32_e32 v60, 8, v28
-; SI-NEXT:    v_lshrrev_b32_e32 v61, 24, v30
-; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v30
-; SI-NEXT:    v_lshrrev_b32_e32 v63, 8, v30
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 24, v39
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 24, v32
-; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v32
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 8, v32
-; SI-NEXT:    s_cbranch_execnz .LBB73_3
-; SI-NEXT:  .LBB73_2: ; %cmp.true
-; SI-NEXT:    v_add_f64 v[17:18], v[17:18], 1.0
-; SI-NEXT:    v_add_f64 v[15:16], v[15:16], 1.0
-; SI-NEXT:    v_alignbit_b32 v33, v18, v17, 24
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v39
+; SI-NEXT:    v_add_f64 v[52:53], s[20:21], 1.0
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v18, v17, 16
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 8, v39
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v18, v17, 8
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 24, v53
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v16, v15, 24
-; SI-NEXT:    v_add_f64 v[13:14], v[13:14], 1.0
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v53
+; SI-NEXT:    v_add_f64 v[44:45], s[18:19], 1.0
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v16, v15, 16
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 8, v53
+; SI-NEXT:    v_add_f64 v[1:2], s[44:45], 1.0
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v16, v15, 8
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 24, v45
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v14, v13, 24
-; SI-NEXT:    v_add_f64 v[11:12], v[11:12], 1.0
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v45
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v14, v13, 16
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[25:26], v[1:2], 24
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v14, v13, 8
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[25:26], v[1:2], 16
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v12, v11, 24
-; SI-NEXT:    v_add_f64 v[9:10], v[9:10], 1.0
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[25:26], v[1:2], 8
+; SI-NEXT:    v_add_f64 v[3:4], s[42:43], 1.0
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v12, v11, 16
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[25:26], v[3:4], 24
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v12, v11, 8
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[25:26], v[3:4], 16
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v10, v9, 24
-; SI-NEXT:    v_add_f64 v[7:8], v[7:8], 1.0
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[25:26], v[3:4], 8
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v10, v9, 16
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[25:26], v[5:6], 24
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v10, v9, 8
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[25:26], v[5:6], 16
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v8, v7, 24
-; SI-NEXT:    v_add_f64 v[5:6], v[5:6], 1.0
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[25:26], v[5:6], 8
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v8, v7, 16
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[25:26], v[7:8], 24
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v8, v7, 8
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[25:26], v[7:8], 16
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v6, v5, 24
-; SI-NEXT:    v_add_f64 v[3:4], v[3:4], 1.0
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[25:26], v[7:8], 8
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v6, v5, 16
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[25:26], v[9:10], 24
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v6, v5, 8
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[25:26], v[9:10], 16
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v4, v3, 24
-; SI-NEXT:    v_add_f64 v[1:2], v[1:2], 1.0
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[25:26], v[9:10], 8
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v4, v3, 16
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[25:26], v[11:12], 24
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v4, v3, 8
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[25:26], v[11:12], 16
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v2, v1, 24
-; SI-NEXT:    v_add_f64 v[19:20], v[19:20], 1.0
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[25:26], v[11:12], 8
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v2, v1, 16
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[25:26], v[13:14], 24
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v2, v1, 8
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[25:26], v[13:14], 16
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v20, v19, 24
-; SI-NEXT:    v_add_f64 v[21:22], v[21:22], 1.0
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[25:26], v[13:14], 8
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v20, v19, 16
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[25:26], v[15:16], 24
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v20, v19, 8
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[25:26], v[15:16], 16
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v22, v21, 24
-; SI-NEXT:    v_add_f64 v[23:24], v[23:24], 1.0
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[25:26], v[15:16], 8
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v22, v21, 16
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[25:26], v[17:18], 24
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v22, v21, 8
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[25:26], v[17:18], 16
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v24, v23, 24
-; SI-NEXT:    v_add_f64 v[25:26], v[25:26], 1.0
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[25:26], v[17:18], 8
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshr_b64 v[25:26], v[19:20], 24
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshr_b64 v[25:26], v[19:20], 16
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshr_b64 v[25:26], v[19:20], 8
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshr_b64 v[25:26], v[21:22], 24
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshr_b64 v[25:26], v[21:22], 16
+; SI-NEXT:    v_lshr_b64 v[47:48], v[23:24], 16
+; SI-NEXT:    v_add_f64 v[58:59], s[16:17], 1.0
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[35:36], v[21:22], 8
+; SI-NEXT:    v_lshr_b64 v[48:49], v[23:24], 8
+; SI-NEXT:    v_lshrrev_b32_e32 v27, 24, v2
+; SI-NEXT:    v_lshr_b64 v[36:37], v[23:24], 24
+; SI-NEXT:    v_lshr_b64 v[49:50], v[38:39], 24
+; SI-NEXT:    v_lshr_b64 v[40:41], v[38:39], 8
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshr_b64 v[25:26], v[44:45], 8
+; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v2
+; SI-NEXT:    v_mov_b32_e32 v37, v27
+; SI-NEXT:    v_lshr_b64 v[50:51], v[38:39], 16
+; SI-NEXT:    v_lshr_b64 v[41:42], v[52:53], 24
+; SI-NEXT:    v_lshr_b64 v[54:55], v[52:53], 8
+; SI-NEXT:    v_lshr_b64 v[26:27], v[58:59], 24
+; SI-NEXT:    v_lshrrev_b32_e32 v29, 8, v2
+; SI-NEXT:    v_mov_b32_e32 v51, v28
+; SI-NEXT:    v_lshr_b64 v[42:43], v[52:53], 16
+; SI-NEXT:    v_lshr_b64 v[55:56], v[44:45], 24
+; SI-NEXT:    v_lshr_b64 v[27:28], v[58:59], 16
+; SI-NEXT:    v_mov_b32_e32 v43, v29
+; SI-NEXT:    v_lshr_b64 v[56:57], v[44:45], 16
+; SI-NEXT:    v_lshr_b64 v[28:29], v[58:59], 8
+; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v30, 8, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v34, 24, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v60, 8, v45
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 24, v59
+; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v59
+; SI-NEXT:    v_lshrrev_b32_e32 v46, 8, v59
+; SI-NEXT:    v_lshrrev_b32_e32 v57, 24, v4
+; SI-NEXT:    s_branch .LBB73_5
+; SI-NEXT:  .LBB73_3:
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s48, 0
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s49, 1
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr38
+; SI-NEXT:    ; implicit-def: $sgpr36
+; SI-NEXT:    ; implicit-def: $sgpr34
+; SI-NEXT:    ; implicit-def: $sgpr30
+; SI-NEXT:    ; implicit-def: $sgpr94
+; SI-NEXT:    ; implicit-def: $sgpr92
+; SI-NEXT:    ; implicit-def: $sgpr90
+; SI-NEXT:    ; implicit-def: $sgpr88
+; SI-NEXT:    ; implicit-def: $sgpr78
+; SI-NEXT:    ; implicit-def: $sgpr76
+; SI-NEXT:    ; implicit-def: $sgpr74
+; SI-NEXT:    ; implicit-def: $sgpr72
+; SI-NEXT:    ; implicit-def: $sgpr62
+; SI-NEXT:    ; implicit-def: $sgpr60
+; SI-NEXT:    ; implicit-def: $sgpr58
+; SI-NEXT:    ; implicit-def: $sgpr56
+; SI-NEXT:    ; implicit-def: $sgpr98
+; SI-NEXT:    ; implicit-def: $sgpr96
+; SI-NEXT:    ; implicit-def: $sgpr86
+; SI-NEXT:    ; implicit-def: $sgpr84
+; SI-NEXT:    ; implicit-def: $sgpr82
+; SI-NEXT:    ; implicit-def: $sgpr80
+; SI-NEXT:    ; implicit-def: $sgpr70
+; SI-NEXT:    ; implicit-def: $sgpr68
+; SI-NEXT:    ; implicit-def: $sgpr66
+; SI-NEXT:    ; implicit-def: $sgpr64
+; SI-NEXT:    ; implicit-def: $sgpr54
+; SI-NEXT:    ; implicit-def: $sgpr52
+; SI-NEXT:    ; implicit-def: $sgpr50
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s48, 2
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s49, 3
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s48, 4
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s49, 5
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s48, 6
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s49, 7
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s48, 8
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s49, 9
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s48, 10
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s49, 11
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s48, 12
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s49, 13
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s48, 14
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s49, 15
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s48, 16
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s49, 17
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s48, 18
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s49, 19
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s48, 20
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s49, 21
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s48, 22
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s49, 23
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s48, 24
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s49, 25
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s48, 26
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s49, 27
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s48, 28
+; SI-NEXT:    v_writelane_b32 v61, s49, 29
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s48, 30
+; SI-NEXT:    v_writelane_b32 v61, s49, 31
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    v_writelane_b32 v61, s48, 32
+; SI-NEXT:    v_writelane_b32 v61, s49, 33
+; SI-NEXT:    ; kill: killed $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    s_branch .LBB73_2
+; SI-NEXT:  .LBB73_4:
+; SI-NEXT:    v_mov_b32_e32 v17, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 34
+; SI-NEXT:    v_mov_b32_e32 v37, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 35
+; SI-NEXT:    v_mov_b32_e32 v51, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 36
+; SI-NEXT:    v_mov_b32_e32 v43, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 37
+; SI-NEXT:    s_waitcnt expcnt(3)
+; SI-NEXT:    v_mov_b32_e32 v57, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 38
+; SI-NEXT:    v_mov_b32_e32 v33, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 39
+; SI-NEXT:    v_mov_b32_e32 v30, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 40
+; SI-NEXT:    v_mov_b32_e32 v34, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 41
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 42
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v24, v23, 16
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 43
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v24, v23, 8
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 44
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v26, v25, 24
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 45
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v26, v25, 16
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 46
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v33, v26, v25, 8
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 47
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 24, v18
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 48
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 8, v18
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 49
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 24, v16
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 50
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v16
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 51
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 8, v16
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 52
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 24, v14
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 53
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v14
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 54
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 8, v14
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 55
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 24, v12
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 56
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v12
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 57
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 8, v12
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 58
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 24, v10
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 59
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v10
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 60
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 8, v10
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 61
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 24, v8
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 62
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v8
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 63
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 8, v8
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_readlane_b32 s4, v62, 0
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 24, v6
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_readlane_b32 s4, v62, 1
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v6
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_readlane_b32 s4, v62, 2
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 8, v6
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_readlane_b32 s4, v62, 3
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 24, v4
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_readlane_b32 s4, v62, 4
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v4
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_readlane_b32 s4, v62, 5
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 8, v4
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_readlane_b32 s4, v62, 6
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 24, v2
-; SI-NEXT:    v_add_f64 v[31:32], v[31:32], 1.0
-; SI-NEXT:    v_add_f64 v[29:30], v[29:30], 1.0
-; SI-NEXT:    v_add_f64 v[27:28], v[27:28], 1.0
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_readlane_b32 s4, v62, 7
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v2
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_readlane_b32 s4, v62, 8
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 8, v2
-; SI-NEXT:    v_alignbit_b32 v38, v28, v27, 24
-; SI-NEXT:    v_alignbit_b32 v48, v28, v27, 16
-; SI-NEXT:    v_alignbit_b32 v50, v28, v27, 8
-; SI-NEXT:    v_alignbit_b32 v52, v30, v29, 24
-; SI-NEXT:    v_alignbit_b32 v54, v30, v29, 16
-; SI-NEXT:    v_alignbit_b32 v40, v30, v29, 8
-; SI-NEXT:    v_alignbit_b32 v42, v32, v31, 24
-; SI-NEXT:    v_alignbit_b32 v44, v32, v31, 16
-; SI-NEXT:    v_alignbit_b32 v46, v32, v31, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v18
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshrrev_b32_e32 v37, 24, v20
-; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v20
-; SI-NEXT:    v_lshrrev_b32_e32 v49, 8, v20
-; SI-NEXT:    v_lshrrev_b32_e32 v51, 24, v22
-; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v22
-; SI-NEXT:    v_lshrrev_b32_e32 v55, 8, v22
-; SI-NEXT:    v_lshrrev_b32_e32 v41, 24, v24
-; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v24
-; SI-NEXT:    v_lshrrev_b32_e32 v45, 8, v24
-; SI-NEXT:    v_lshrrev_b32_e32 v47, 24, v26
-; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v26
-; SI-NEXT:    v_lshrrev_b32_e32 v57, 8, v26
-; SI-NEXT:    v_lshrrev_b32_e32 v58, 24, v28
-; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v28
-; SI-NEXT:    v_lshrrev_b32_e32 v60, 8, v28
-; SI-NEXT:    v_lshrrev_b32_e32 v61, 24, v30
-; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v30
-; SI-NEXT:    v_lshrrev_b32_e32 v63, 8, v30
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_readlane_b32 s4, v62, 9
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 24, v32
-; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v32
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 8, v32
-; SI-NEXT:  .LBB73_3: ; %end
-; SI-NEXT:    v_lshlrev_b32_e32 v46, 8, v46
-; SI-NEXT:    v_and_b32_e32 v31, 0xff, v31
-; SI-NEXT:    v_and_b32_e32 v44, 0xff, v44
-; SI-NEXT:    v_or_b32_e32 v31, v31, v46
-; SI-NEXT:    v_lshlrev_b32_e32 v44, 16, v44
-; SI-NEXT:    v_lshlrev_b32_e32 v42, 24, v42
-; SI-NEXT:    v_or_b32_e32 v42, v42, v44
-; SI-NEXT:    v_and_b32_e32 v31, 0xffff, v31
-; SI-NEXT:    v_or_b32_e32 v31, v31, v42
-; SI-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_readlane_b32 s4, v62, 10
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v31, 0xff, v32
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 8, v35
-; SI-NEXT:    v_or_b32_e32 v31, v31, v32
-; SI-NEXT:    v_and_b32_e32 v32, 0xff, v34
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
-; SI-NEXT:    v_lshlrev_b32_e32 v33, 24, v33
-; SI-NEXT:    v_or_b32_e32 v32, v33, v32
-; SI-NEXT:    v_and_b32_e32 v31, 0xffff, v31
-; SI-NEXT:    v_or_b32_e32 v31, v31, v32
-; SI-NEXT:    v_add_i32_e32 v32, vcc, 4, v0
-; SI-NEXT:    buffer_store_dword v31, v32, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_readlane_b32 s4, v62, 11
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v31, 8, v40
-; SI-NEXT:    v_and_b32_e32 v29, 0xff, v29
-; SI-NEXT:    v_or_b32_e32 v29, v29, v31
-; SI-NEXT:    v_and_b32_e32 v31, 0xff, v54
-; SI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 24, v52
-; SI-NEXT:    v_or_b32_e32 v31, v32, v31
-; SI-NEXT:    v_and_b32_e32 v29, 0xffff, v29
-; SI-NEXT:    v_or_b32_e32 v29, v29, v31
-; SI-NEXT:    v_add_i32_e32 v31, vcc, 8, v0
-; SI-NEXT:    buffer_store_dword v29, v31, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_readlane_b32 s4, v62, 12
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v29, 0xff, v30
-; SI-NEXT:    v_lshlrev_b32_e32 v30, 8, v63
-; SI-NEXT:    v_or_b32_e32 v29, v29, v30
-; SI-NEXT:    v_and_b32_e32 v30, 0xff, v62
-; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; SI-NEXT:    v_lshlrev_b32_e32 v31, 24, v61
-; SI-NEXT:    v_or_b32_e32 v30, v31, v30
-; SI-NEXT:    v_and_b32_e32 v29, 0xffff, v29
-; SI-NEXT:    v_or_b32_e32 v29, v29, v30
-; SI-NEXT:    v_add_i32_e32 v30, vcc, 12, v0
-; SI-NEXT:    buffer_store_dword v29, v30, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_readlane_b32 s4, v62, 13
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v29, 8, v50
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_mov_b32_e32 v29, s46
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v29, s98
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v29, s96
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v29, s86
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v29, s84
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v29, s82
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v29, s80
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v29, s70
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v29, s68
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v29, s66
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v29, s64
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v29, s54
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v29, s52
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v29, s50
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT:    v_readlane_b32 s4, v62, 14
+; SI-NEXT:    v_mov_b32_e32 v60, s4
+; SI-NEXT:    v_readlane_b32 s4, v62, 15
+; SI-NEXT:    v_mov_b32_e32 v31, s4
+; SI-NEXT:    v_readlane_b32 s4, v62, 16
+; SI-NEXT:    v_mov_b32_e32 v32, s4
+; SI-NEXT:    v_readlane_b32 s4, v62, 17
+; SI-NEXT:    v_mov_b32_e32 v18, s5
+; SI-NEXT:    v_mov_b32_e32 v46, s4
+; SI-NEXT:    v_readlane_b32 s4, v61, 0
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v29, s4
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT:    v_readlane_b32 s5, v61, 1
+; SI-NEXT:    v_readlane_b32 s4, v61, 2
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v29, s4
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT:    v_readlane_b32 s5, v61, 3
+; SI-NEXT:    v_readlane_b32 s4, v61, 4
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v29, s4
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT:    v_readlane_b32 s5, v61, 5
+; SI-NEXT:    v_readlane_b32 s4, v61, 6
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v29, s4
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT:    v_readlane_b32 s5, v61, 7
+; SI-NEXT:    v_readlane_b32 s4, v61, 8
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v29, s4
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT:    v_readlane_b32 s5, v61, 9
+; SI-NEXT:    v_readlane_b32 s4, v61, 10
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v29, s4
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT:    v_readlane_b32 s5, v61, 11
+; SI-NEXT:    v_readlane_b32 s4, v61, 12
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v29, s4
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT:    v_readlane_b32 s5, v61, 13
+; SI-NEXT:    v_readlane_b32 s4, v61, 14
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v29, s4
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT:    v_readlane_b32 s5, v61, 15
+; SI-NEXT:    v_readlane_b32 s4, v61, 16
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v29, s4
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT:    v_readlane_b32 s5, v61, 17
+; SI-NEXT:    v_readlane_b32 s4, v61, 18
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v29, s4
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    v_readlane_b32 s5, v61, 19
+; SI-NEXT:    v_readlane_b32 s4, v61, 20
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v29, s4
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    v_readlane_b32 s5, v61, 21
+; SI-NEXT:    v_readlane_b32 s4, v61, 22
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v29, s4
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    v_readlane_b32 s5, v61, 23
+; SI-NEXT:    v_readlane_b32 s4, v61, 24
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v29, s4
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    v_readlane_b32 s5, v61, 25
+; SI-NEXT:    v_readlane_b32 s4, v61, 26
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v29, s4
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    v_readlane_b32 s5, v61, 27
+; SI-NEXT:    v_readlane_b32 s4, v61, 28
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v29, s4
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    v_readlane_b32 s5, v61, 29
+; SI-NEXT:    v_readlane_b32 s4, v61, 30
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v29, s4
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v29, s48
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    v_readlane_b32 s5, v61, 31
+; SI-NEXT:    v_readlane_b32 s4, v61, 32
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v29, s4
+; SI-NEXT:    v_mov_b32_e32 v59, s17
+; SI-NEXT:    v_mov_b32_e32 v58, s16
+; SI-NEXT:    v_mov_b32_e32 v45, s19
+; SI-NEXT:    v_mov_b32_e32 v44, s18
+; SI-NEXT:    v_mov_b32_e32 v53, s21
+; SI-NEXT:    v_mov_b32_e32 v52, s20
+; SI-NEXT:    v_mov_b32_e32 v39, s23
+; SI-NEXT:    v_mov_b32_e32 v38, s22
+; SI-NEXT:    v_mov_b32_e32 v24, s25
+; SI-NEXT:    v_mov_b32_e32 v23, s24
+; SI-NEXT:    v_mov_b32_e32 v22, s27
+; SI-NEXT:    v_mov_b32_e32 v21, s26
+; SI-NEXT:    v_mov_b32_e32 v20, s29
+; SI-NEXT:    v_mov_b32_e32 v19, s28
+; SI-NEXT:    v_mov_b32_e32 v16, s7
+; SI-NEXT:    v_mov_b32_e32 v15, s6
+; SI-NEXT:    v_mov_b32_e32 v14, s9
+; SI-NEXT:    v_mov_b32_e32 v13, s8
+; SI-NEXT:    v_mov_b32_e32 v12, s11
+; SI-NEXT:    v_mov_b32_e32 v11, s10
+; SI-NEXT:    v_mov_b32_e32 v10, s13
+; SI-NEXT:    v_mov_b32_e32 v9, s12
+; SI-NEXT:    v_mov_b32_e32 v8, s15
+; SI-NEXT:    v_mov_b32_e32 v7, s14
+; SI-NEXT:    v_mov_b32_e32 v6, s41
+; SI-NEXT:    v_mov_b32_e32 v5, s40
+; SI-NEXT:    v_mov_b32_e32 v4, s43
+; SI-NEXT:    v_mov_b32_e32 v3, s42
+; SI-NEXT:    v_mov_b32_e32 v2, s45
+; SI-NEXT:    v_mov_b32_e32 v1, s44
+; SI-NEXT:    v_mov_b32_e32 v28, s38
+; SI-NEXT:    v_mov_b32_e32 v27, s36
+; SI-NEXT:    v_mov_b32_e32 v26, s34
+; SI-NEXT:    v_mov_b32_e32 v25, s30
+; SI-NEXT:    v_mov_b32_e32 v56, s94
+; SI-NEXT:    v_mov_b32_e32 v55, s92
+; SI-NEXT:    v_mov_b32_e32 v54, s90
+; SI-NEXT:    v_mov_b32_e32 v42, s88
+; SI-NEXT:    v_mov_b32_e32 v41, s78
+; SI-NEXT:    v_mov_b32_e32 v40, s76
+; SI-NEXT:    v_mov_b32_e32 v50, s74
+; SI-NEXT:    v_mov_b32_e32 v49, s72
+; SI-NEXT:    v_mov_b32_e32 v48, s62
+; SI-NEXT:    v_mov_b32_e32 v47, s60
+; SI-NEXT:    v_mov_b32_e32 v36, s58
+; SI-NEXT:    v_mov_b32_e32 v35, s56
+; SI-NEXT:    v_readlane_b32 s5, v61, 33
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT:  .LBB73_5: ; %end
+; SI-NEXT:    v_lshlrev_b32_e32 v28, 8, v28
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_and_b32_e32 v29, 0xff, v58
 ; SI-NEXT:    v_and_b32_e32 v27, 0xff, v27
-; SI-NEXT:    v_or_b32_e32 v27, v27, v29
-; SI-NEXT:    v_and_b32_e32 v29, 0xff, v48
-; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
-; SI-NEXT:    v_lshlrev_b32_e32 v30, 24, v38
-; SI-NEXT:    v_or_b32_e32 v29, v30, v29
-; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v27
-; SI-NEXT:    v_or_b32_e32 v27, v27, v29
-; SI-NEXT:    v_add_i32_e32 v29, vcc, 16, v0
-; SI-NEXT:    buffer_store_dword v27, v29, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v27, 0xff, v28
-; SI-NEXT:    v_lshlrev_b32_e32 v28, 8, v60
-; SI-NEXT:    v_or_b32_e32 v27, v27, v28
-; SI-NEXT:    v_and_b32_e32 v28, 0xff, v59
-; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
-; SI-NEXT:    v_lshlrev_b32_e32 v29, 24, v58
 ; SI-NEXT:    v_or_b32_e32 v28, v29, v28
-; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v27
-; SI-NEXT:    v_or_b32_e32 v27, v27, v28
-; SI-NEXT:    v_add_i32_e32 v28, vcc, 20, v0
-; SI-NEXT:    buffer_store_dword v27, v28, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v26, 24, v26
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; SI-NEXT:    v_or_b32_e32 v26, v26, v27
+; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v28
+; SI-NEXT:    v_or_b32_e32 v26, v27, v26
+; SI-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v25, 0xff, v25
+; SI-NEXT:    v_and_b32_e32 v26, 0xff, v59
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 8, v46
+; SI-NEXT:    v_or_b32_e32 v26, v26, v27
+; SI-NEXT:    v_and_b32_e32 v27, 0xff, v32
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; SI-NEXT:    v_lshlrev_b32_e32 v28, 24, v31
+; SI-NEXT:    v_or_b32_e32 v27, v28, v27
+; SI-NEXT:    v_and_b32_e32 v26, 0xffff, v26
+; SI-NEXT:    v_or_b32_e32 v26, v26, v27
+; SI-NEXT:    v_add_i32_e32 v27, vcc, 4, v0
+; SI-NEXT:    buffer_store_dword v26, v27, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 8, v25
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v26, 0xff, v44
+; SI-NEXT:    v_or_b32_e32 v25, v26, v25
+; SI-NEXT:    v_and_b32_e32 v26, 0xff, v56
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 24, v55
+; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; SI-NEXT:    v_or_b32_e32 v26, v27, v26
+; SI-NEXT:    v_and_b32_e32 v25, 0xffff, v25
+; SI-NEXT:    v_or_b32_e32 v25, v25, v26
+; SI-NEXT:    v_add_i32_e32 v26, vcc, 8, v0
+; SI-NEXT:    buffer_store_dword v25, v26, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v25, 0xff, v45
+; SI-NEXT:    v_lshlrev_b32_e32 v26, 8, v60
+; SI-NEXT:    v_or_b32_e32 v25, v25, v26
+; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v25, 0xffff, v25
 ; SI-NEXT:    v_and_b32_e32 v23, 0xff, v23
 ; SI-NEXT:    v_and_b32_e32 v21, 0xff, v21
 ; SI-NEXT:    v_and_b32_e32 v19, 0xff, v19
+; SI-NEXT:    v_and_b32_e32 v17, 0xff, v17
+; SI-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; SI-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; SI-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; SI-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT:    v_readlane_b32 s99, v63, 35
+; SI-NEXT:    v_readlane_b32 s98, v63, 34
+; SI-NEXT:    v_readlane_b32 s97, v63, 33
+; SI-NEXT:    v_readlane_b32 s96, v63, 32
+; SI-NEXT:    v_readlane_b32 s87, v63, 31
+; SI-NEXT:    v_readlane_b32 s86, v63, 30
+; SI-NEXT:    v_readlane_b32 s85, v63, 29
+; SI-NEXT:    v_readlane_b32 s84, v63, 28
+; SI-NEXT:    v_readlane_b32 s83, v63, 27
+; SI-NEXT:    v_readlane_b32 s82, v63, 26
+; SI-NEXT:    v_readlane_b32 s81, v63, 25
+; SI-NEXT:    v_readlane_b32 s80, v63, 24
+; SI-NEXT:    v_readlane_b32 s71, v63, 23
+; SI-NEXT:    v_readlane_b32 s70, v63, 22
+; SI-NEXT:    v_readlane_b32 s69, v63, 21
+; SI-NEXT:    v_readlane_b32 s68, v63, 20
+; SI-NEXT:    v_readlane_b32 s67, v63, 19
+; SI-NEXT:    v_readlane_b32 s66, v63, 18
+; SI-NEXT:    v_readlane_b32 s65, v63, 17
+; SI-NEXT:    v_readlane_b32 s64, v63, 16
+; SI-NEXT:    v_readlane_b32 s55, v63, 15
+; SI-NEXT:    v_readlane_b32 s54, v63, 14
+; SI-NEXT:    v_readlane_b32 s53, v63, 13
+; SI-NEXT:    v_readlane_b32 s52, v63, 12
+; SI-NEXT:    v_readlane_b32 s51, v63, 11
+; SI-NEXT:    v_readlane_b32 s50, v63, 10
+; SI-NEXT:    v_readlane_b32 s49, v63, 9
+; SI-NEXT:    v_readlane_b32 s48, v63, 8
+; SI-NEXT:    v_readlane_b32 s39, v63, 7
+; SI-NEXT:    v_readlane_b32 s38, v63, 6
+; SI-NEXT:    v_readlane_b32 s37, v63, 5
+; SI-NEXT:    v_readlane_b32 s36, v63, 4
+; SI-NEXT:    v_readlane_b32 s35, v63, 3
+; SI-NEXT:    v_readlane_b32 s34, v63, 2
+; SI-NEXT:    v_readlane_b32 s31, v63, 1
+; SI-NEXT:    v_readlane_b32 s30, v63, 0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v27, 8, v27
-; SI-NEXT:    v_or_b32_e32 v25, v25, v27
-; SI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v26, 0xff, v26
+; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 24, v27
+; SI-NEXT:    v_or_b32_e32 v26, v27, v26
+; SI-NEXT:    v_or_b32_e32 v25, v25, v26
+; SI-NEXT:    v_add_i32_e32 v26, vcc, 12, v0
+; SI-NEXT:    buffer_store_dword v25, v26, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 8, v54
+; SI-NEXT:    v_and_b32_e32 v26, 0xff, v52
+; SI-NEXT:    v_or_b32_e32 v25, v26, v25
+; SI-NEXT:    v_and_b32_e32 v26, 0xff, v42
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 24, v41
+; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; SI-NEXT:    v_or_b32_e32 v26, v27, v26
+; SI-NEXT:    v_and_b32_e32 v25, 0xffff, v25
+; SI-NEXT:    v_or_b32_e32 v25, v25, v26
+; SI-NEXT:    v_add_i32_e32 v26, vcc, 16, v0
+; SI-NEXT:    buffer_store_dword v25, v26, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v25, 0xff, v53
+; SI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v26, 8, v26
+; SI-NEXT:    v_or_b32_e32 v25, v25, v26
+; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v25, 0xffff, v25
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v28, 24, v28
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 24, v27
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v27, 0xff, v27
-; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; SI-NEXT:    v_or_b32_e32 v27, v28, v27
-; SI-NEXT:    v_or_b32_e32 v25, v25, v27
-; SI-NEXT:    v_add_i32_e32 v27, vcc, 24, v0
-; SI-NEXT:    buffer_store_dword v25, v27, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v25, 0xff, v26
-; SI-NEXT:    v_lshlrev_b32_e32 v26, 8, v57
+; SI-NEXT:    v_and_b32_e32 v26, 0xff, v26
+; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; SI-NEXT:    v_or_b32_e32 v26, v27, v26
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v26
-; SI-NEXT:    v_and_b32_e32 v26, 0xff, v56
+; SI-NEXT:    v_add_i32_e32 v26, vcc, 20, v0
+; SI-NEXT:    buffer_store_dword v25, v26, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 8, v40
+; SI-NEXT:    v_and_b32_e32 v26, 0xff, v38
+; SI-NEXT:    v_or_b32_e32 v25, v26, v25
+; SI-NEXT:    v_and_b32_e32 v26, 0xff, v50
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 24, v49
 ; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; SI-NEXT:    v_lshlrev_b32_e32 v27, 24, v47
 ; SI-NEXT:    v_or_b32_e32 v26, v27, v26
 ; SI-NEXT:    v_and_b32_e32 v25, 0xffff, v25
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v26
-; SI-NEXT:    v_add_i32_e32 v26, vcc, 28, v0
+; SI-NEXT:    v_add_i32_e32 v26, vcc, 24, v0
 ; SI-NEXT:    buffer_store_dword v25, v26, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v25, 0xff, v39
+; SI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 8, v25
-; SI-NEXT:    v_or_b32_e32 v23, v23, v25
-; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v23
+; SI-NEXT:    v_lshlrev_b32_e32 v26, 8, v26
+; SI-NEXT:    v_or_b32_e32 v25, v25, v26
+; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v25, 0xffff, v25
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v26, 24, v26
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 24, v27
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v25, 0xff, v25
+; SI-NEXT:    v_and_b32_e32 v26, 0xff, v26
+; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; SI-NEXT:    v_or_b32_e32 v26, v27, v26
+; SI-NEXT:    v_or_b32_e32 v25, v25, v26
+; SI-NEXT:    v_add_i32_e32 v26, vcc, 28, v0
+; SI-NEXT:    buffer_store_dword v25, v26, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 8, v48
+; SI-NEXT:    v_or_b32_e32 v23, v23, v25
+; SI-NEXT:    v_and_b32_e32 v25, 0xff, v47
+; SI-NEXT:    v_lshlrev_b32_e32 v26, 24, v36
 ; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
 ; SI-NEXT:    v_or_b32_e32 v25, v26, v25
+; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v23
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v25
 ; SI-NEXT:    v_add_i32_e32 v25, vcc, 32, v0
 ; SI-NEXT:    buffer_store_dword v23, v25, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v23, 0xff, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v24, 8, v45
+; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v24, 8, v24
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v24
-; SI-NEXT:    v_and_b32_e32 v24, 0xff, v43
+; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v23
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 24, v25
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v24, 0xff, v24
 ; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 24, v41
 ; SI-NEXT:    v_or_b32_e32 v24, v25, v24
-; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v23
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v24
 ; SI-NEXT:    v_add_i32_e32 v24, vcc, 36, v0
 ; SI-NEXT:    buffer_store_dword v23, v24, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v23, 8, v23
+; SI-NEXT:    v_lshlrev_b32_e32 v23, 8, v35
 ; SI-NEXT:    v_or_b32_e32 v21, v21, v23
-; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_and_b32_e32 v23, 0xff, v23
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v24, 24, v24
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v23, 0xff, v23
 ; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
 ; SI-NEXT:    v_or_b32_e32 v23, v24, v23
 ; SI-NEXT:    v_or_b32_e32 v21, v21, v23
@@ -113423,28 +115385,37 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v21, v23, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v21, 0xff, v22
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 8, v55
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v22, 8, v22
 ; SI-NEXT:    v_or_b32_e32 v21, v21, v22
-; SI-NEXT:    v_and_b32_e32 v22, 0xff, v53
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v23, 24, v23
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v22, 0xff, v22
 ; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; SI-NEXT:    v_lshlrev_b32_e32 v23, 24, v51
 ; SI-NEXT:    v_or_b32_e32 v22, v23, v22
-; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v21
 ; SI-NEXT:    v_or_b32_e32 v21, v21, v22
 ; SI-NEXT:    v_add_i32_e32 v22, vcc, 44, v0
 ; SI-NEXT:    buffer_store_dword v21, v22, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v21, 8, v21
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v21
-; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_and_b32_e32 v21, 0xff, v21
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v22, 24, v22
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v21, 0xff, v21
 ; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
 ; SI-NEXT:    v_or_b32_e32 v21, v22, v21
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v21
@@ -113452,518 +115423,370 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v19, v21, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v19, 0xff, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 8, v49
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 8, v20
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    v_and_b32_e32 v20, 0xff, v39
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v21, 24, v21
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v20, 0xff, v20
 ; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 24, v37
 ; SI-NEXT:    v_or_b32_e32 v20, v21, v20
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v20
 ; SI-NEXT:    v_add_i32_e32 v20, vcc, 52, v0
 ; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v19, 8, v19
-; SI-NEXT:    v_or_b32_e32 v1, v1, v19
-; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v17, v17, v19
+; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_and_b32_e32 v19, 0xff, v19
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v20, 24, v20
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v19, 0xff, v19
 ; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
 ; SI-NEXT:    v_or_b32_e32 v19, v20, v19
-; SI-NEXT:    v_or_b32_e32 v1, v1, v19
+; SI-NEXT:    v_or_b32_e32 v17, v17, v19
 ; SI-NEXT:    v_add_i32_e32 v19, vcc, 56, v0
-; SI-NEXT:    buffer_store_dword v1, v19, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v17, v19, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v17, 0xff, v18
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 8, v18
+; SI-NEXT:    v_or_b32_e32 v17, v17, v18
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_lshlrev_b32_e32 v19, 24, v19
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v19, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 60, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v18, 0xff, v18
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; SI-NEXT:    v_or_b32_e32 v18, v19, v18
+; SI-NEXT:    v_or_b32_e32 v17, v17, v18
+; SI-NEXT:    v_add_i32_e32 v18, vcc, 60, v0
+; SI-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; SI-NEXT:    v_or_b32_e32 v1, v2, v1
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
+; SI-NEXT:    v_or_b32_e32 v15, v15, v17
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_and_b32_e32 v17, 0xff, v17
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 64, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 24, v18
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; SI-NEXT:    v_or_b32_e32 v17, v18, v17
+; SI-NEXT:    v_or_b32_e32 v15, v15, v17
+; SI-NEXT:    v_add_i32_e32 v17, vcc, 64, v0
+; SI-NEXT:    buffer_store_dword v15, v17, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v4
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v15, 0xff, v16
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v16, 8, v16
+; SI-NEXT:    v_or_b32_e32 v15, v15, v16
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 24, v17
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x44, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; SI-NEXT:    v_or_b32_e32 v16, v17, v16
+; SI-NEXT:    v_or_b32_e32 v15, v15, v16
+; SI-NEXT:    v_add_i32_e32 v16, vcc, 0x44, v0
+; SI-NEXT:    buffer_store_dword v15, v16, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v5
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 8, v15
+; SI-NEXT:    v_or_b32_e32 v13, v13, v15
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_and_b32_e32 v15, 0xff, v15
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x48, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v16, 24, v16
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; SI-NEXT:    v_or_b32_e32 v15, v16, v15
+; SI-NEXT:    v_or_b32_e32 v13, v13, v15
+; SI-NEXT:    v_add_i32_e32 v15, vcc, 0x48, v0
+; SI-NEXT:    buffer_store_dword v13, v15, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v6
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v13, 0xff, v14
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
+; SI-NEXT:    v_or_b32_e32 v13, v13, v14
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 24, v15
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v13
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x4c, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    v_or_b32_e32 v14, v15, v14
+; SI-NEXT:    v_or_b32_e32 v13, v13, v14
+; SI-NEXT:    v_add_i32_e32 v14, vcc, 0x4c, v0
+; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v7
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
+; SI-NEXT:    v_or_b32_e32 v11, v11, v13
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_and_b32_e32 v13, 0xff, v13
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x50, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 24, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; SI-NEXT:    v_or_b32_e32 v13, v14, v13
+; SI-NEXT:    v_or_b32_e32 v11, v11, v13
+; SI-NEXT:    v_add_i32_e32 v13, vcc, 0x50, v0
+; SI-NEXT:    buffer_store_dword v11, v13, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v8
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v11, 0xff, v12
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v12, 8, v12
+; SI-NEXT:    v_or_b32_e32 v11, v11, v12
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 24, v13
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x54, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; SI-NEXT:    v_or_b32_e32 v12, v13, v12
+; SI-NEXT:    v_or_b32_e32 v11, v11, v12
+; SI-NEXT:    v_add_i32_e32 v12, vcc, 0x54, v0
+; SI-NEXT:    buffer_store_dword v11, v12, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v9
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
+; SI-NEXT:    v_or_b32_e32 v9, v9, v11
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x58, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
+; SI-NEXT:    v_or_b32_e32 v11, v12, v11
+; SI-NEXT:    v_or_b32_e32 v9, v9, v11
+; SI-NEXT:    v_add_i32_e32 v11, vcc, 0x58, v0
+; SI-NEXT:    buffer_store_dword v9, v11, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v10
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v9, 0xff, v10
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
+; SI-NEXT:    v_or_b32_e32 v9, v9, v10
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x5c, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v11
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x60, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_or_b32_e32 v10, v11, v10
+; SI-NEXT:    v_or_b32_e32 v9, v9, v10
+; SI-NEXT:    v_add_i32_e32 v10, vcc, 0x5c, v0
+; SI-NEXT:    buffer_store_dword v9, v10, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v12
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
+; SI-NEXT:    v_or_b32_e32 v7, v7, v9
+; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x64, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; SI-NEXT:    v_or_b32_e32 v9, v10, v9
+; SI-NEXT:    v_or_b32_e32 v7, v7, v9
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 0x60, v0
+; SI-NEXT:    buffer_store_dword v7, v9, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v13
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v7, 0xff, v8
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
+; SI-NEXT:    v_or_b32_e32 v7, v7, v8
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
+; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x68, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; SI-NEXT:    v_or_b32_e32 v8, v9, v8
+; SI-NEXT:    v_or_b32_e32 v7, v7, v8
+; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x64, v0
+; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v14
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
+; SI-NEXT:    v_or_b32_e32 v5, v5, v7
+; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x6c, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; SI-NEXT:    v_or_b32_e32 v7, v8, v7
+; SI-NEXT:    v_or_b32_e32 v5, v5, v7
+; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x68, v0
+; SI-NEXT:    buffer_store_dword v5, v7, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v15
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_and_b32_e32 v5, 0xff, v6
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v34
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x70, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
+; SI-NEXT:    v_or_b32_e32 v5, v5, v6
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT:    v_or_b32_e32 v6, v7, v6
+; SI-NEXT:    v_or_b32_e32 v5, v5, v6
+; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x6c, v0
+; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v16
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
+; SI-NEXT:    v_or_b32_e32 v3, v3, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x74, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; SI-NEXT:    v_or_b32_e32 v5, v6, v5
+; SI-NEXT:    v_or_b32_e32 v3, v3, v5
+; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x70, v0
+; SI-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v17
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v30
+; SI-NEXT:    v_or_b32_e32 v3, v3, v4
+; SI-NEXT:    v_and_b32_e32 v4, 0xff, v33
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 24, v57
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT:    v_or_b32_e32 v4, v5, v4
+; SI-NEXT:    v_or_b32_e32 v3, v3, v4
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x74, v0
+; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x78, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v18
-; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; SI-NEXT:    v_or_b32_e32 v3, v4, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x78, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v43
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v36
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v51
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v37
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
-; SI-NEXT:  .LBB73_4:
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr46
-; SI-NEXT:    ; implicit-def: $vgpr44
-; SI-NEXT:    ; implicit-def: $vgpr42
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr34
-; SI-NEXT:    ; implicit-def: $vgpr33
-; SI-NEXT:    ; implicit-def: $vgpr40
-; SI-NEXT:    ; implicit-def: $vgpr54
-; SI-NEXT:    ; implicit-def: $vgpr52
-; SI-NEXT:    ; implicit-def: $vgpr63
-; SI-NEXT:    ; implicit-def: $vgpr62
-; SI-NEXT:    ; implicit-def: $vgpr61
-; SI-NEXT:    ; implicit-def: $vgpr50
-; SI-NEXT:    ; implicit-def: $vgpr48
-; SI-NEXT:    ; implicit-def: $vgpr38
-; SI-NEXT:    ; implicit-def: $vgpr60
-; SI-NEXT:    ; implicit-def: $vgpr59
-; SI-NEXT:    ; implicit-def: $vgpr58
-; SI-NEXT:    ; implicit-def: $vgpr57
-; SI-NEXT:    ; implicit-def: $vgpr56
-; SI-NEXT:    ; implicit-def: $vgpr47
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr43
-; SI-NEXT:    ; implicit-def: $vgpr41
-; SI-NEXT:    ; implicit-def: $vgpr55
-; SI-NEXT:    ; implicit-def: $vgpr53
-; SI-NEXT:    ; implicit-def: $vgpr51
-; SI-NEXT:    ; implicit-def: $vgpr49
-; SI-NEXT:    ; implicit-def: $vgpr39
-; SI-NEXT:    ; implicit-def: $vgpr37
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; kill: killed $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    s_branch .LBB73_2
 ;
 ; VI-LABEL: bitcast_v16f64_to_v128i8_scalar:
 ; VI:       ; %bb.0:
@@ -140462,219 +142285,224 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB85_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_alignbit_b32 v33, v18, v17, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v18
 ; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT:    v_alignbit_b32 v34, v16, v15, 16
-; SI-NEXT:    v_alignbit_b32 v35, v14, v13, 16
-; SI-NEXT:    v_alignbit_b32 v36, v12, v11, 16
-; SI-NEXT:    v_alignbit_b32 v37, v10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v38, v8, v7, 16
-; SI-NEXT:    v_alignbit_b32 v39, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v49, v4, v3, 16
-; SI-NEXT:    v_alignbit_b32 v51, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v54, v20, v19, 16
-; SI-NEXT:    v_alignbit_b32 v40, v22, v21, 16
-; SI-NEXT:    v_alignbit_b32 v42, v24, v23, 16
-; SI-NEXT:    v_alignbit_b32 v45, v26, v25, 16
-; SI-NEXT:    v_alignbit_b32 v47, v28, v27, 16
-; SI-NEXT:    s_waitcnt expcnt(6)
-; SI-NEXT:    v_alignbit_b32 v58, v30, v29, 16
-; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    v_alignbit_b32 v60, v32, v31, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v20
-; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v22
-; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v24
-; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v26
-; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v28
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v30
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v32
+; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v16
+; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v14
+; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v12
+; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v10
+; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshr_b64 v[33:34], v[17:18], 16
+; SI-NEXT:    v_lshr_b64 v[34:35], v[15:16], 16
+; SI-NEXT:    v_lshr_b64 v[35:36], v[13:14], 16
+; SI-NEXT:    v_lshr_b64 v[36:37], v[11:12], 16
+; SI-NEXT:    v_lshr_b64 v[37:38], v[9:10], 16
+; SI-NEXT:    v_lshr_b64 v[38:39], v[7:8], 16
+; SI-NEXT:    v_lshr_b64 v[39:40], v[23:24], 16
+; SI-NEXT:    v_lshr_b64 v[40:41], v[25:26], 16
+; SI-NEXT:    v_lshr_b64 v[41:42], v[27:28], 16
+; SI-NEXT:    v_lshr_b64 v[51:52], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[48:49], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[42:43], v[29:30], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v24
+; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v26
+; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v28
+; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v30
+; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v32
+; SI-NEXT:    v_lshr_b64 v[54:55], v[3:4], 16
+; SI-NEXT:    v_lshr_b64 v[49:50], v[19:20], 16
+; SI-NEXT:    v_lshr_b64 v[52:53], v[21:22], 16
+; SI-NEXT:    v_lshr_b64 v[43:44], v[31:32], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB85_3
 ; SI-NEXT:  .LBB85_2: ; %cmp.true
 ; SI-NEXT:    v_add_f64 v[17:18], v[17:18], 1.0
-; SI-NEXT:    v_add_f64 v[31:32], v[31:32], 1.0
-; SI-NEXT:    v_add_f64 v[29:30], v[29:30], 1.0
-; SI-NEXT:    v_add_f64 v[27:28], v[27:28], 1.0
-; SI-NEXT:    v_add_f64 v[25:26], v[25:26], 1.0
-; SI-NEXT:    v_add_f64 v[23:24], v[23:24], 1.0
-; SI-NEXT:    v_add_f64 v[21:22], v[21:22], 1.0
-; SI-NEXT:    v_add_f64 v[19:20], v[19:20], 1.0
-; SI-NEXT:    v_add_f64 v[1:2], v[1:2], 1.0
-; SI-NEXT:    v_add_f64 v[3:4], v[3:4], 1.0
-; SI-NEXT:    v_add_f64 v[5:6], v[5:6], 1.0
-; SI-NEXT:    v_add_f64 v[7:8], v[7:8], 1.0
-; SI-NEXT:    v_add_f64 v[9:10], v[9:10], 1.0
-; SI-NEXT:    v_add_f64 v[11:12], v[11:12], 1.0
-; SI-NEXT:    v_add_f64 v[13:14], v[13:14], 1.0
 ; SI-NEXT:    v_add_f64 v[15:16], v[15:16], 1.0
-; SI-NEXT:    v_alignbit_b32 v33, v18, v17, 16
+; SI-NEXT:    v_add_f64 v[13:14], v[13:14], 1.0
+; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v18
+; SI-NEXT:    v_add_f64 v[11:12], v[11:12], 1.0
 ; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT:    v_alignbit_b32 v34, v16, v15, 16
-; SI-NEXT:    v_alignbit_b32 v35, v14, v13, 16
-; SI-NEXT:    v_alignbit_b32 v36, v12, v11, 16
-; SI-NEXT:    v_alignbit_b32 v37, v10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v38, v8, v7, 16
-; SI-NEXT:    v_alignbit_b32 v39, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v49, v4, v3, 16
-; SI-NEXT:    v_alignbit_b32 v51, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v54, v20, v19, 16
-; SI-NEXT:    v_alignbit_b32 v40, v22, v21, 16
-; SI-NEXT:    v_alignbit_b32 v42, v24, v23, 16
-; SI-NEXT:    v_alignbit_b32 v45, v26, v25, 16
-; SI-NEXT:    v_alignbit_b32 v47, v28, v27, 16
-; SI-NEXT:    s_waitcnt expcnt(6)
-; SI-NEXT:    v_alignbit_b32 v58, v30, v29, 16
-; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    v_alignbit_b32 v60, v32, v31, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v20
-; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v22
-; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v24
-; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v26
-; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v28
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v30
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v32
+; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v16
+; SI-NEXT:    v_add_f64 v[9:10], v[9:10], 1.0
+; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v14
+; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v12
+; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v10
+; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshr_b64 v[33:34], v[17:18], 16
+; SI-NEXT:    v_lshr_b64 v[34:35], v[15:16], 16
+; SI-NEXT:    v_lshr_b64 v[35:36], v[13:14], 16
+; SI-NEXT:    v_add_f64 v[7:8], v[7:8], 1.0
+; SI-NEXT:    v_lshr_b64 v[36:37], v[11:12], 16
+; SI-NEXT:    v_add_f64 v[23:24], v[23:24], 1.0
+; SI-NEXT:    v_lshr_b64 v[37:38], v[9:10], 16
+; SI-NEXT:    v_add_f64 v[25:26], v[25:26], 1.0
+; SI-NEXT:    v_lshr_b64 v[38:39], v[7:8], 16
+; SI-NEXT:    v_add_f64 v[27:28], v[27:28], 1.0
+; SI-NEXT:    v_lshr_b64 v[39:40], v[23:24], 16
+; SI-NEXT:    v_add_f64 v[5:6], v[5:6], 1.0
+; SI-NEXT:    v_add_f64 v[1:2], v[1:2], 1.0
+; SI-NEXT:    v_add_f64 v[29:30], v[29:30], 1.0
+; SI-NEXT:    v_lshr_b64 v[40:41], v[25:26], 16
+; SI-NEXT:    v_add_f64 v[3:4], v[3:4], 1.0
+; SI-NEXT:    v_add_f64 v[19:20], v[19:20], 1.0
+; SI-NEXT:    v_add_f64 v[21:22], v[21:22], 1.0
+; SI-NEXT:    v_add_f64 v[31:32], v[31:32], 1.0
+; SI-NEXT:    v_lshr_b64 v[41:42], v[27:28], 16
+; SI-NEXT:    v_lshr_b64 v[51:52], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[48:49], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[42:43], v[29:30], 16
+; SI-NEXT:    v_lshr_b64 v[54:55], v[3:4], 16
+; SI-NEXT:    v_lshr_b64 v[49:50], v[19:20], 16
+; SI-NEXT:    v_lshr_b64 v[52:53], v[21:22], 16
+; SI-NEXT:    v_lshr_b64 v[43:44], v[31:32], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v24
+; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v26
+; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v28
+; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v30
+; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v32
 ; SI-NEXT:  .LBB85_3: ; %end
-; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v60
+; SI-NEXT:    v_lshlrev_b32_e32 v50, 16, v43
 ; SI-NEXT:    v_and_b32_e32 v31, 0xffff, v31
-; SI-NEXT:    v_or_b32_e32 v31, v31, v60
+; SI-NEXT:    v_or_b32_e32 v31, v31, v50
 ; SI-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v31, 0xffff, v32
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v33
+; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v57
 ; SI-NEXT:    v_or_b32_e32 v31, v31, v32
 ; SI-NEXT:    v_add_i32_e32 v32, vcc, 4, v0
 ; SI-NEXT:    buffer_store_dword v31, v32, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v31, 16, v58
+; SI-NEXT:    v_lshlrev_b32_e32 v31, 16, v42
 ; SI-NEXT:    v_and_b32_e32 v29, 0xffff, v29
 ; SI-NEXT:    v_or_b32_e32 v29, v29, v31
 ; SI-NEXT:    v_add_i32_e32 v31, vcc, 8, v0
 ; SI-NEXT:    buffer_store_dword v29, v31, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v29, 0xffff, v30
-; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v63
+; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v56
 ; SI-NEXT:    v_or_b32_e32 v29, v29, v30
 ; SI-NEXT:    v_add_i32_e32 v30, vcc, 12, v0
 ; SI-NEXT:    buffer_store_dword v29, v30, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v27
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v47
+; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v41
+; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v27
 ; SI-NEXT:    v_or_b32_e32 v27, v27, v29
 ; SI-NEXT:    v_add_i32_e32 v29, vcc, 16, v0
 ; SI-NEXT:    buffer_store_dword v27, v29, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v28
-; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v62
+; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v47
 ; SI-NEXT:    v_or_b32_e32 v27, v27, v28
 ; SI-NEXT:    v_add_i32_e32 v28, vcc, 20, v0
 ; SI-NEXT:    buffer_store_dword v27, v28, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v25, 0xffff, v25
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v45
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v40
+; SI-NEXT:    v_and_b32_e32 v25, 0xffff, v25
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v27
 ; SI-NEXT:    v_add_i32_e32 v27, vcc, 24, v0
 ; SI-NEXT:    buffer_store_dword v25, v27, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v25, 0xffff, v26
-; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v61
+; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v46
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v26
 ; SI-NEXT:    v_add_i32_e32 v26, vcc, 28, v0
 ; SI-NEXT:    buffer_store_dword v25, v26, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v23
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v42
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v39
+; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v23
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v25
 ; SI-NEXT:    v_add_i32_e32 v25, vcc, 32, v0
 ; SI-NEXT:    buffer_store_dword v23, v25, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v59
+; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v45
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v24
 ; SI-NEXT:    v_add_i32_e32 v24, vcc, 36, v0
 ; SI-NEXT:    buffer_store_dword v23, v24, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v21
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v40
+; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v52
 ; SI-NEXT:    v_or_b32_e32 v21, v21, v23
 ; SI-NEXT:    v_add_i32_e32 v23, vcc, 40, v0
 ; SI-NEXT:    buffer_store_dword v21, v23, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v22
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v57
+; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v63
 ; SI-NEXT:    v_or_b32_e32 v21, v21, v22
 ; SI-NEXT:    v_add_i32_e32 v22, vcc, 44, v0
 ; SI-NEXT:    buffer_store_dword v21, v22, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v54
+; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v49
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v21
 ; SI-NEXT:    v_add_i32_e32 v21, vcc, 48, v0
 ; SI-NEXT:    buffer_store_dword v19, v21, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v56
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v62
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v20
 ; SI-NEXT:    v_add_i32_e32 v20, vcc, 52, v0
 ; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v51
+; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v48
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v19
 ; SI-NEXT:    v_add_i32_e32 v19, vcc, 56, v0
 ; SI-NEXT:    buffer_store_dword v1, v19, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v46
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v61
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 60, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v49
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v54
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 64, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v44
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v60
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x44, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v39
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v51
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x48, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v43
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v59
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x4c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -140686,7 +142514,7 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v41
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v58
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x54, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -140696,9 +142524,11 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x58, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v10
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v55
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x5c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -140708,9 +142538,11 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x60, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v53
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x64, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -140720,9 +142552,11 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x68, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v14
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v52
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x6c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -140732,25 +142566,27 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a,
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x70, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v16
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v50
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x74, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v17
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v33
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x78, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v18
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v48
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -140771,39 +142607,43 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a,
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB85_4:
-; SI-NEXT:    ; implicit-def: $vgpr48
-; SI-NEXT:    ; implicit-def: $vgpr60
 ; SI-NEXT:    ; implicit-def: $vgpr33
-; SI-NEXT:    ; implicit-def: $vgpr58
-; SI-NEXT:    ; implicit-def: $vgpr63
+; SI-NEXT:    ; kill: killed $vgpr33
+; SI-NEXT:    ; implicit-def: $vgpr33
+; SI-NEXT:    ; kill: killed $vgpr33
+; SI-NEXT:    ; implicit-def: $vgpr43
+; SI-NEXT:    ; implicit-def: $vgpr57
+; SI-NEXT:    ; implicit-def: $vgpr42
+; SI-NEXT:    ; implicit-def: $vgpr56
+; SI-NEXT:    ; implicit-def: $vgpr41
 ; SI-NEXT:    ; implicit-def: $vgpr47
-; SI-NEXT:    ; implicit-def: $vgpr62
+; SI-NEXT:    ; implicit-def: $vgpr40
+; SI-NEXT:    ; implicit-def: $vgpr46
+; SI-NEXT:    ; implicit-def: $vgpr39
 ; SI-NEXT:    ; implicit-def: $vgpr45
+; SI-NEXT:    ; implicit-def: $vgpr52
+; SI-NEXT:    ; implicit-def: $vgpr63
+; SI-NEXT:    ; implicit-def: $vgpr49
+; SI-NEXT:    ; implicit-def: $vgpr62
+; SI-NEXT:    ; implicit-def: $vgpr48
 ; SI-NEXT:    ; implicit-def: $vgpr61
-; SI-NEXT:    ; implicit-def: $vgpr42
+; SI-NEXT:    ; implicit-def: $vgpr60
 ; SI-NEXT:    ; implicit-def: $vgpr59
-; SI-NEXT:    ; implicit-def: $vgpr40
-; SI-NEXT:    ; implicit-def: $vgpr57
+; SI-NEXT:    ; implicit-def: $vgpr58
 ; SI-NEXT:    ; implicit-def: $vgpr54
-; SI-NEXT:    ; implicit-def: $vgpr56
 ; SI-NEXT:    ; implicit-def: $vgpr51
-; SI-NEXT:    ; implicit-def: $vgpr46
-; SI-NEXT:    ; implicit-def: $vgpr49
-; SI-NEXT:    ; implicit-def: $vgpr44
-; SI-NEXT:    ; implicit-def: $vgpr39
-; SI-NEXT:    ; implicit-def: $vgpr43
 ; SI-NEXT:    ; implicit-def: $vgpr38
-; SI-NEXT:    ; implicit-def: $vgpr41
 ; SI-NEXT:    ; implicit-def: $vgpr37
-; SI-NEXT:    ; implicit-def: $vgpr55
 ; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr53
 ; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr52
 ; SI-NEXT:    ; implicit-def: $vgpr34
-; SI-NEXT:    ; implicit-def: $vgpr50
-; SI-NEXT:    ; kill: killed $vgpr48
-; SI-NEXT:    ; implicit-def: $vgpr48
+; SI-NEXT:    ; implicit-def: $vgpr33
+; SI-NEXT:    ; kill: killed $vgpr33
+; SI-NEXT:    ; implicit-def: $vgpr33
+; SI-NEXT:    ; kill: killed $vgpr33
+; SI-NEXT:    ; implicit-def: $vgpr33
+; SI-NEXT:    ; kill: killed $vgpr33
+; SI-NEXT:    ; implicit-def: $vgpr33
 ; SI-NEXT:    s_branch .LBB85_2
 ;
 ; VI-LABEL: bitcast_v16f64_to_v64i16_scalar:
@@ -162237,22 +164077,24 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; SI-LABEL: bitcast_v64bf16_to_v128i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
+; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:80
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32
@@ -162265,596 +164107,622 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:36
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:32
 ; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:44
-; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:52
-; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:60
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:56
-; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:68
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:64
-; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:76
+; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:40
+; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:52
+; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:48
+; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:60
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:56
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:68
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:64
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:76
+; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:72
+; SI-NEXT:    v_writelane_b32 v63, s30, 0
+; SI-NEXT:    v_writelane_b32 v63, s31, 1
+; SI-NEXT:    v_writelane_b32 v63, s34, 2
+; SI-NEXT:    v_writelane_b32 v63, s35, 3
+; SI-NEXT:    v_writelane_b32 v63, s36, 4
+; SI-NEXT:    v_writelane_b32 v63, s37, 5
+; SI-NEXT:    v_writelane_b32 v63, s38, 6
+; SI-NEXT:    v_writelane_b32 v63, s39, 7
+; SI-NEXT:    v_writelane_b32 v63, s48, 8
+; SI-NEXT:    v_writelane_b32 v63, s49, 9
+; SI-NEXT:    v_writelane_b32 v63, s50, 10
+; SI-NEXT:    v_writelane_b32 v63, s51, 11
+; SI-NEXT:    v_writelane_b32 v63, s52, 12
+; SI-NEXT:    v_writelane_b32 v63, s53, 13
+; SI-NEXT:    v_writelane_b32 v63, s54, 14
+; SI-NEXT:    v_writelane_b32 v63, s55, 15
+; SI-NEXT:    v_writelane_b32 v63, s64, 16
+; SI-NEXT:    v_writelane_b32 v63, s65, 17
+; SI-NEXT:    v_writelane_b32 v63, s66, 18
+; SI-NEXT:    v_writelane_b32 v63, s67, 19
+; SI-NEXT:    v_writelane_b32 v63, s68, 20
+; SI-NEXT:    v_writelane_b32 v63, s69, 21
+; SI-NEXT:    v_writelane_b32 v63, s70, 22
+; SI-NEXT:    v_writelane_b32 v63, s71, 23
+; SI-NEXT:    v_writelane_b32 v63, s80, 24
+; SI-NEXT:    v_writelane_b32 v63, s81, 25
+; SI-NEXT:    v_writelane_b32 v63, s82, 26
+; SI-NEXT:    v_writelane_b32 v63, s83, 27
 ; SI-NEXT:    s_waitcnt expcnt(5)
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:72
-; SI-NEXT:    v_mov_b32_e32 v42, v22
-; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_mul_f32_e32 v60, 1.0, v1
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v18
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT:    v_mul_f32_e32 v56, 1.0, v2
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v10
+; SI-NEXT:    v_writelane_b32 v63, s84, 28
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v42
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
-; SI-NEXT:    v_mov_b32_e32 v59, v26
-; SI-NEXT:    v_mul_f32_e32 v32, 1.0, v2
-; SI-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; SI-NEXT:    v_mul_f32_e32 v22, 1.0, v6
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v9
+; SI-NEXT:    v_writelane_b32 v63, s85, 29
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v14
+; SI-NEXT:    v_writelane_b32 v63, s86, 30
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v15
+; SI-NEXT:    v_writelane_b32 v63, s87, 31
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v18
+; SI-NEXT:    v_writelane_b32 v63, s96, 32
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v26
+; SI-NEXT:    v_writelane_b32 v63, s97, 33
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT:    v_writelane_b32 v63, s98, 34
+; SI-NEXT:    v_mov_b32_e32 v46, v21
+; SI-NEXT:    v_writelane_b32 v63, s99, 35
+; SI-NEXT:    v_mul_f32_e32 v47, 1.0, v1
+; SI-NEXT:    v_mul_f32_e32 v32, 1.0, v4
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v3
+; SI-NEXT:    v_mul_f32_e32 v58, 1.0, v6
 ; SI-NEXT:    v_mul_f32_e32 v41, 1.0, v5
-; SI-NEXT:    v_mul_f32_e32 v57, 1.0, v8
-; SI-NEXT:    v_mul_f32_e32 v7, 1.0, v7
-; SI-NEXT:    v_mul_f32_e32 v56, 1.0, v10
-; SI-NEXT:    v_mul_f32_e32 v26, 1.0, v9
-; SI-NEXT:    v_mul_f32_e32 v61, 1.0, v12
-; SI-NEXT:    v_mul_f32_e32 v63, 1.0, v11
-; SI-NEXT:    v_mul_f32_e32 v11, 1.0, v14
-; SI-NEXT:    v_mul_f32_e32 v52, 1.0, v13
-; SI-NEXT:    v_mul_f32_e32 v40, 1.0, v15
-; SI-NEXT:    v_mul_f32_e32 v18, 1.0, v17
-; SI-NEXT:    v_mul_f32_e32 v17, 1.0, v20
-; SI-NEXT:    v_mul_f32_e32 v19, 1.0, v19
-; SI-NEXT:    v_mul_f32_e32 v21, 1.0, v21
-; SI-NEXT:    v_mul_f32_e32 v20, 1.0, v24
-; SI-NEXT:    v_mul_f32_e32 v42, 1.0, v23
-; SI-NEXT:    v_mul_f32_e32 v23, 1.0, v59
-; SI-NEXT:    v_mul_f32_e32 v24, 1.0, v25
-; SI-NEXT:    v_mul_f32_e32 v25, 1.0, v28
-; SI-NEXT:    v_mul_f32_e32 v27, 1.0, v27
-; SI-NEXT:    v_mul_f32_e32 v59, 1.0, v30
-; SI-NEXT:    v_mul_f32_e32 v29, 1.0, v29
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s16
-; SI-NEXT:    v_mul_f32_e64 v14, 1.0, s19
-; SI-NEXT:    v_mul_f32_e64 v9, 1.0, s18
-; SI-NEXT:    v_mul_f32_e64 v5, 1.0, s21
-; SI-NEXT:    v_mul_f32_e64 v6, 1.0, s20
-; SI-NEXT:    v_mul_f32_e64 v12, 1.0, s22
-; SI-NEXT:    v_mul_f32_e64 v10, 1.0, s25
-; SI-NEXT:    v_mul_f32_e64 v13, 1.0, s24
-; SI-NEXT:    v_mul_f32_e64 v15, 1.0, s26
-; SI-NEXT:    v_mul_f32_e64 v8, 1.0, s29
+; SI-NEXT:    v_mul_f32_e32 v59, 1.0, v8
+; SI-NEXT:    v_mul_f32_e32 v61, 1.0, v7
+; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v12
+; SI-NEXT:    v_mul_f32_e32 v60, 1.0, v11
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT:    v_mul_f32_e32 v31, 1.0, v16
-; SI-NEXT:    v_mul_f32_e32 v30, 1.0, v33
-; SI-NEXT:    v_mul_f32_e32 v28, 1.0, v34
+; SI-NEXT:    v_mul_f32_e32 v13, 1.0, v13
+; SI-NEXT:    v_mul_f32_e32 v21, 1.0, v16
+; SI-NEXT:    v_mul_f32_e32 v17, 1.0, v17
+; SI-NEXT:    v_mul_f32_e32 v9, 1.0, v20
+; SI-NEXT:    v_mul_f32_e32 v12, 1.0, v19
+; SI-NEXT:    v_mul_f32_e32 v22, 1.0, v22
+; SI-NEXT:    v_mul_f32_e32 v20, 1.0, v46
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v36
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v48
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT:    v_mul_f32_e32 v24, 1.0, v24
+; SI-NEXT:    v_mul_f32_e32 v46, 1.0, v23
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v52
+; SI-NEXT:    v_mul_f32_e32 v26, 1.0, v25
+; SI-NEXT:    v_mul_f32_e32 v57, 1.0, v28
+; SI-NEXT:    v_mul_f32_e32 v16, 1.0, v27
+; SI-NEXT:    v_mul_f32_e32 v28, 1.0, v30
+; SI-NEXT:    v_mul_f32_e32 v30, 1.0, v29
+; SI-NEXT:    v_mul_f32_e32 v31, 1.0, v33
+; SI-NEXT:    v_mul_f32_e32 v27, 1.0, v34
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_mul_f32_e32 v34, 1.0, v35
+; SI-NEXT:    v_mul_f32_e32 v29, 1.0, v35
+; SI-NEXT:    v_mul_f32_e32 v36, 1.0, v36
 ; SI-NEXT:    v_mul_f32_e32 v35, 1.0, v37
-; SI-NEXT:    v_mul_f32_e32 v38, 1.0, v38
-; SI-NEXT:    v_mul_f32_e32 v39, 1.0, v39
-; SI-NEXT:    v_mul_f32_e32 v37, 1.0, v48
+; SI-NEXT:    v_mul_f32_e32 v34, 1.0, v38
+; SI-NEXT:    v_mul_f32_e32 v37, 1.0, v39
+; SI-NEXT:    v_mul_f32_e32 v48, 1.0, v49
+; SI-NEXT:    v_mul_f32_e32 v39, 1.0, v50
+; SI-NEXT:    v_mul_f32_e32 v33, 1.0, v51
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_mul_f32_e32 v33, 1.0, v49
+; SI-NEXT:    v_mul_f32_e32 v51, 1.0, v53
+; SI-NEXT:    v_mul_f32_e32 v50, 1.0, v54
 ; SI-NEXT:    s_waitcnt vmcnt(13)
-; SI-NEXT:    v_mul_f32_e32 v50, 1.0, v50
+; SI-NEXT:    v_mul_f32_e32 v49, 1.0, v55
 ; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_mul_f32_e32 v48, 1.0, v51
+; SI-NEXT:    v_mul_f32_e32 v38, 1.0, v40
 ; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_mul_f32_e32 v51, 1.0, v53
-; SI-NEXT:    s_waitcnt vmcnt(7) expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v44
-; SI-NEXT:    v_mul_f32_e32 v36, 1.0, v54
-; SI-NEXT:    v_mul_f32_e32 v55, 1.0, v55
-; SI-NEXT:    v_mul_f32_e32 v62, 1.0, v43
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_mul_f32_e32 v44, 1.0, v45
-; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_mul_f32_e32 v46, 1.0, v46
-; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_mul_f32_e32 v53, 1.0, v47
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_mul_f32_e32 v43, 1.0, v58
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s17
-; SI-NEXT:    v_mul_f32_e64 v49, 1.0, s23
-; SI-NEXT:    v_mul_f32_e64 v54, 1.0, s27
-; SI-NEXT:    v_mul_f32_e64 v16, 1.0, s28
-; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; SI-NEXT:    v_mul_f32_e32 v55, 1.0, v42
+; SI-NEXT:    s_waitcnt vmcnt(10)
+; SI-NEXT:    v_mul_f32_e32 v54, 1.0, v43
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_mul_f32_e32 v43, 1.0, v44
+; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_mul_f32_e32 v53, 1.0, v45
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s17
+; SI-NEXT:    v_mul_f32_e64 v11, 1.0, s16
+; SI-NEXT:    v_mul_f32_e64 v3, 1.0, s19
+; SI-NEXT:    v_mul_f32_e64 v4, 1.0, s18
+; SI-NEXT:    v_mul_f32_e64 v14, 1.0, s21
+; SI-NEXT:    v_mul_f32_e64 v15, 1.0, s20
+; SI-NEXT:    v_mul_f32_e64 v7, 1.0, s23
+; SI-NEXT:    v_mul_f32_e64 v6, 1.0, s22
+; SI-NEXT:    v_mul_f32_e64 v18, 1.0, s25
+; SI-NEXT:    v_mul_f32_e64 v19, 1.0, s24
+; SI-NEXT:    v_mul_f32_e64 v10, 1.0, s27
+; SI-NEXT:    v_mul_f32_e64 v8, 1.0, s26
+; SI-NEXT:    v_mul_f32_e64 v23, 1.0, s29
+; SI-NEXT:    v_mul_f32_e64 v25, 1.0, s28
+; SI-NEXT:    ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
+; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB91_2
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_alignbit_b32 v2, v1, v2, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
-; SI-NEXT:    v_alignbit_b32 v5, v1, v6, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v10
-; SI-NEXT:    v_alignbit_b32 v6, v1, v13, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
-; SI-NEXT:    v_alignbit_b32 v10, v1, v16, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
-; SI-NEXT:    v_alignbit_b32 v4, v1, v3, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v57
-; SI-NEXT:    v_alignbit_b32 v3, v1, v7, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v61
-; SI-NEXT:    v_alignbit_b32 v16, v1, v63, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v31
-; SI-NEXT:    v_alignbit_b32 v13, v1, v40, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v17
-; SI-NEXT:    v_alignbit_b32 v61, v1, v19, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v20
-; SI-NEXT:    v_alignbit_b32 v7, v1, v42, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v25
-; SI-NEXT:    v_alignbit_b32 v63, v1, v27, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v30
-; SI-NEXT:    v_alignbit_b32 v45, v1, v28, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v35
-; SI-NEXT:    v_alignbit_b32 v8, v1, v38, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v33
-; SI-NEXT:    v_alignbit_b32 v33, v1, v50, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v36
-; SI-NEXT:    v_mov_b32_e32 v57, v62
-; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v14
-; SI-NEXT:    v_alignbit_b32 v28, v1, v55, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v44
-; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_alignbit_b32 v44, v62, v9, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v56
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
-; SI-NEXT:    v_alignbit_b32 v55, v9, v26, 16
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v11
-; SI-NEXT:    v_alignbit_b32 v38, v9, v52, 16
-; SI-NEXT:    v_mov_b32_e32 v52, v59
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v49
-; SI-NEXT:    v_alignbit_b32 v1, v1, v46, 16
-; SI-NEXT:    v_alignbit_b32 v46, v47, v12, 16
-; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
-; SI-NEXT:    v_mov_b32_e32 v36, v8
-; SI-NEXT:    v_alignbit_b32 v8, v44, v2, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v44, v2, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v44, v2, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v46, v5, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v46, v5, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v46, v5, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v54
-; SI-NEXT:    v_alignbit_b32 v19, v8, v15, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v19, v6, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v19, v6, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 24, v11
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v19, v6, 8
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 8, v38
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v32
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
-; SI-NEXT:    v_alignbit_b32 v50, v8, v60, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v50, v10, 24
-; SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v22
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v50, v10, 8
-; SI-NEXT:    v_alignbit_b32 v25, v60, v41, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v25, v4, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v25, v4, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v25, v4, 8
-; SI-NEXT:    v_mov_b32_e32 v17, v32
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v55, v3, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v55, v3, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v55, v3, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v38, v16, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v38, v16, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v38, v16, 8
-; SI-NEXT:    v_alignbit_b32 v20, v50, v10, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshrrev_b32_e32 v14, 24, v14
-; SI-NEXT:    s_mov_b64 s[4:5], 0
-; SI-NEXT:    v_mov_b32_e32 v58, v41
-; SI-NEXT:    v_mov_b32_e32 v40, v26
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v14, 8, v44
-; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v59
-; SI-NEXT:    v_alignbit_b32 v35, v9, v18, 16
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v9, v35, v13, 24
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v9, v35, v13, 16
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v42
-; SI-NEXT:    v_alignbit_b32 v30, v9, v21, 16
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v9, v30, v61, 24
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v9, v30, v61, 16
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v9, v30, v61, 8
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v23
-; SI-NEXT:    v_alignbit_b32 v27, v9, v24, 16
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v9, v27, v7, 24
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v9, v27, v7, 16
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v9, v27, v7, 8
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v52
-; SI-NEXT:    v_alignbit_b32 v24, v9, v29, 16
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v3
+; SI-NEXT:    v_mov_b32_e32 v42, v37
+; SI-NEXT:    v_alignbit_b32 v37, v2, v11, 16
+; SI-NEXT:    v_alignbit_b32 v11, v44, v4, 16
+; SI-NEXT:    v_readfirstlane_b32 s4, v37
+; SI-NEXT:    v_readfirstlane_b32 s5, v11
+; SI-NEXT:    s_lshr_b64 s[6:7], s[4:5], 24
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v14
+; SI-NEXT:    v_writelane_b32 v62, s6, 0
+; SI-NEXT:    v_alignbit_b32 v2, v2, v15, 16
+; SI-NEXT:    v_writelane_b32 v62, s7, 1
+; SI-NEXT:    s_lshr_b64 s[6:7], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[4:5], 8
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT:    v_readfirstlane_b32 s4, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v7
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v9, v24, v63, 16
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v18
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT:    v_alignbit_b32 v14, v52, v6, 16
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; SI-NEXT:    v_readfirstlane_b32 s5, v14
+; SI-NEXT:    v_alignbit_b32 v2, v2, v19, 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[4:5], 24
+; SI-NEXT:    s_lshr_b64 s[12:13], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[16:17], s[4:5], 8
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    v_readfirstlane_b32 s4, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v9, v24, v63, 8
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v10
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v19, v2, v8, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v34
-; SI-NEXT:    v_alignbit_b32 v21, v9, v12, 16
-; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v23
+; SI-NEXT:    v_readfirstlane_b32 s5, v19
+; SI-NEXT:    v_alignbit_b32 v2, v2, v25, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v56
+; SI-NEXT:    s_lshr_b64 s[14:15], s[4:5], 24
+; SI-NEXT:    s_lshr_b64 s[18:19], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[22:23], s[4:5], 8
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT:    v_readfirstlane_b32 s4, v2
+; SI-NEXT:    v_alignbit_b32 v47, v45, v47, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v9, v21, v45, 24
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v32
+; SI-NEXT:    v_readfirstlane_b32 s5, v47
+; SI-NEXT:    v_alignbit_b32 v2, v2, v1, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v58
+; SI-NEXT:    s_lshr_b64 s[20:21], s[4:5], 24
+; SI-NEXT:    s_lshr_b64 s[24:25], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[28:29], s[4:5], 8
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    v_readfirstlane_b32 s4, v2
+; SI-NEXT:    v_mov_b32_e32 v4, v58
+; SI-NEXT:    v_alignbit_b32 v58, v8, v41, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v9, v21, v45, 8
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v59
+; SI-NEXT:    v_readfirstlane_b32 s5, v58
+; SI-NEXT:    v_alignbit_b32 v2, v2, v61, 16
+; SI-NEXT:    s_lshr_b64 s[26:27], s[4:5], 24
+; SI-NEXT:    s_lshr_b64 s[40:41], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[4:5], 8
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    v_readfirstlane_b32 s4, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v39
-; SI-NEXT:    v_alignbit_b32 v18, v9, v37, 16
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
+; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT:    v_alignbit_b32 v2, v2, v60, 16
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v23, v22
+; SI-NEXT:    v_mov_b32_e32 v40, v36
+; SI-NEXT:    s_mov_b64 vcc, 0
+; SI-NEXT:    v_lshrrev_b32_e32 v56, 24, v56
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v18
+; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_alignbit_b32 v41, v15, v6, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v41
+; SI-NEXT:    s_lshr_b64 s[42:43], s[4:5], 24
+; SI-NEXT:    s_lshr_b64 s[46:47], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[58:59], s[4:5], 8
+; SI-NEXT:    v_readfirstlane_b32 s4, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v9, v18, v36, 24
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 24, v59
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v21
+; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v32
+; SI-NEXT:    v_alignbit_b32 v59, v1, v13, 16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT:    v_readfirstlane_b32 s5, v59
+; SI-NEXT:    s_waitcnt vmcnt(3) expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; SI-NEXT:    s_lshr_b64 s[56:57], s[4:5], 24
+; SI-NEXT:    s_lshr_b64 s[60:61], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[72:73], s[4:5], 8
+; SI-NEXT:    v_alignbit_b32 v61, v1, v17, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v61
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 8, v58
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_alignbit_b32 v2, v2, v21, 16
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    v_readfirstlane_b32 s4, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v9, v18, v36, 16
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v9
+; SI-NEXT:    v_alignbit_b32 v2, v2, v12, 16
+; SI-NEXT:    s_lshr_b64 s[62:63], s[4:5], 24
+; SI-NEXT:    s_lshr_b64 s[74:75], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[78:79], s[4:5], 8
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    v_readfirstlane_b32 s4, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 8, v35
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v22
+; SI-NEXT:    v_alignbit_b32 v60, v2, v20, 16
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v9, v18, v36, 8
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v24
+; SI-NEXT:    v_alignbit_b32 v1, v2, v46, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v60
+; SI-NEXT:    s_lshr_b64 s[76:77], s[4:5], 24
+; SI-NEXT:    s_lshr_b64 s[88:89], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[92:93], s[4:5], 8
+; SI-NEXT:    v_readfirstlane_b32 s4, v1
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 24, v42
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v1, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v48
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 24, v23
+; SI-NEXT:    v_mov_b32_e32 v5, v28
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v24, 24, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v9, 8, v47
+; SI-NEXT:    v_lshrrev_b32_e32 v12, 8, v41
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 8, v61
+; SI-NEXT:    v_lshrrev_b32_e32 v23, 8, v60
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v20
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v25, v2, v26, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 8, v30
-; SI-NEXT:    v_alignbit_b32 v15, v9, v51, 16
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v57
+; SI-NEXT:    v_readfirstlane_b32 s5, v25
+; SI-NEXT:    v_alignbit_b32 v2, v2, v16, 16
+; SI-NEXT:    s_lshr_b64 s[90:91], s[4:5], 24
+; SI-NEXT:    s_lshr_b64 s[94:95], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[34:35], s[4:5], 8
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    v_readfirstlane_b32 s4, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 24, v23
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v28
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v22, v2, v30, 16
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v31
+; SI-NEXT:    v_readfirstlane_b32 s5, v22
+; SI-NEXT:    v_alignbit_b32 v2, v2, v27, 16
+; SI-NEXT:    s_lshr_b64 s[30:31], s[4:5], 24
+; SI-NEXT:    s_lshr_b64 s[36:37], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[38:39], s[4:5], 8
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    v_readfirstlane_b32 s4, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v9, v15, v33, 24
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v29
+; SI-NEXT:    v_alignbit_b32 v17, v2, v36, 16
+; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 8, v27
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v35
+; SI-NEXT:    v_readfirstlane_b32 s5, v17
+; SI-NEXT:    v_alignbit_b32 v2, v2, v34, 16
+; SI-NEXT:    s_lshr_b64 s[48:49], s[4:5], 24
+; SI-NEXT:    s_lshr_b64 s[50:51], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[52:53], s[4:5], 8
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    v_readfirstlane_b32 s4, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v9, v15, v33, 16
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v42
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 24, v20
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 24, v52
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 24, v5
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v9, v15, v33, 8
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v22
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 8, v24
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 24, v29
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v29, v37
+; SI-NEXT:    v_mov_b32_e32 v37, v42
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v57
-; SI-NEXT:    v_lshrrev_b32_e32 v51, 24, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 8, v50
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v17
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 24, v34
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 24, v37
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 24, v22
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 24, v33
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v34, 24, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v16, 24, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 8, v11
+; SI-NEXT:    v_lshrrev_b32_e32 v26, 24, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 8, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v27, 8, v19
+; SI-NEXT:    v_lshrrev_b32_e32 v4, 8, v59
+; SI-NEXT:    v_lshrrev_b32_e32 v35, 24, v43
+; SI-NEXT:    v_mov_b32_e32 v31, v20
+; SI-NEXT:    v_mov_b32_e32 v20, v34
+; SI-NEXT:    s_waitcnt vmcnt(10)
+; SI-NEXT:    v_alignbit_b32 v30, v2, v36, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v48
+; SI-NEXT:    v_alignbit_b32 v2, v2, v39, 16
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; SI-NEXT:    v_readfirstlane_b32 s5, v30
+; SI-NEXT:    s_lshr_b64 s[54:55], s[4:5], 24
+; SI-NEXT:    s_lshr_b64 s[64:65], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[68:69], s[4:5], 8
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    v_readfirstlane_b32 s4, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 8, v21
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v33
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v28, v36
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_alignbit_b32 v57, v2, v39, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 8, v25
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v51
+; SI-NEXT:    v_readfirstlane_b32 s5, v57
+; SI-NEXT:    v_alignbit_b32 v2, v2, v50, 16
+; SI-NEXT:    s_lshr_b64 s[66:67], s[4:5], 24
+; SI-NEXT:    s_lshr_b64 s[70:71], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[82:83], s[4:5], 8
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    v_readfirstlane_b32 s4, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 24, v39
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v49
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v46, v2, v38, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 24, v56
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v55
+; SI-NEXT:    v_readfirstlane_b32 s5, v46
+; SI-NEXT:    v_alignbit_b32 v2, v2, v54, 16
+; SI-NEXT:    s_lshr_b64 s[80:81], s[4:5], 24
+; SI-NEXT:    s_lshr_b64 s[84:85], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[96:97], s[4:5], 8
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT:    v_readfirstlane_b32 s4, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 8, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v37, 24, v49
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v43
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v57
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v38, v2, v53, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 8, v55
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 24, v18
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 8, v15
-; SI-NEXT:    v_alignbit_b32 v8, v35, v13, 8
-; SI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 24, v49
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v37, 24, v54
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v2, v32
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v24, v63, 24
-; SI-NEXT:    v_alignbit_b32 v31, v21, v45, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v49, 8, v46
-; SI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshrrev_b32_e32 v54, 8, v19
-; SI-NEXT:    v_mov_b32_e32 v39, v20
-; SI-NEXT:    v_lshrrev_b32_e32 v23, 24, v48
-; SI-NEXT:    v_lshrrev_b32_e32 v57, 24, v57
-; SI-NEXT:    v_mov_b32_e32 v52, v42
-; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_alignbit_b32 v12, v9, v12, 16
-; SI-NEXT:    v_alignbit_b32 v9, v12, v28, 8
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v46
+; SI-NEXT:    v_readfirstlane_b32 s5, v38
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v53
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v9, v9, v43, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 8, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 8, v9
-; SI-NEXT:    v_alignbit_b32 v41, v12, v28, 24
-; SI-NEXT:    v_alignbit_b32 v32, v12, v28, 16
-; SI-NEXT:    v_alignbit_b32 v29, v9, v1, 24
-; SI-NEXT:    v_alignbit_b32 v43, v9, v1, 16
-; SI-NEXT:    v_alignbit_b32 v26, v9, v1, 8
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v38
+; SI-NEXT:    s_lshr_b64 s[86:87], s[4:5], 24
+; SI-NEXT:    s_lshr_b64 s[98:99], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[4:5], 8
+; SI-NEXT:    v_mov_b32_e32 v32, v8
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v18, 8, v25
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 24, v53
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 8, v30
+; SI-NEXT:    v_mov_b32_e32 v55, v49
+; SI-NEXT:    v_mov_b32_e32 v49, v15
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v8, v6
 ; SI-NEXT:    s_branch .LBB91_3
 ; SI-NEXT:  .LBB91_2:
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr8
+; SI-NEXT:    ; kill: killed $vgpr8
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; kill: killed $vgpr2
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    s_waitcnt expcnt(5)
+; SI-NEXT:    v_mov_b32_e32 v55, v49
+; SI-NEXT:    ; implicit-def: $vgpr8
+; SI-NEXT:    ; kill: killed $vgpr8
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; kill: killed $vgpr2
+; SI-NEXT:    v_writelane_b32 v62, s4, 0
+; SI-NEXT:    ; implicit-def: $vgpr8
+; SI-NEXT:    ; kill: killed $vgpr8
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; kill: killed $vgpr2
+; SI-NEXT:    v_mov_b32_e32 v40, v36
+; SI-NEXT:    ; implicit-def: $vgpr8
+; SI-NEXT:    ; kill: killed $vgpr8
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; kill: killed $vgpr2
+; SI-NEXT:    s_mov_b64 vcc, -1
+; SI-NEXT:    ; implicit-def: $vgpr8
+; SI-NEXT:    ; kill: killed $vgpr8
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; kill: killed $vgpr2
+; SI-NEXT:    v_writelane_b32 v62, s5, 1
+; SI-NEXT:    ; implicit-def: $vgpr8
+; SI-NEXT:    ; kill: killed $vgpr8
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; kill: killed $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr29
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $vgpr11
+; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $vgpr44
+; SI-NEXT:    ; implicit-def: $vgpr16
+; SI-NEXT:    ; implicit-def: $sgpr16
+; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr8
+; SI-NEXT:    ; implicit-def: $vgpr14
+; SI-NEXT:    ; implicit-def: $vgpr7
+; SI-NEXT:    ; implicit-def: $vgpr52
+; SI-NEXT:    ; implicit-def: $vgpr26
+; SI-NEXT:    ; implicit-def: $sgpr22
+; SI-NEXT:    ; implicit-def: $sgpr18
+; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $vgpr19
+; SI-NEXT:    ; implicit-def: $vgpr27
+; SI-NEXT:    ; implicit-def: $vgpr24
+; SI-NEXT:    ; implicit-def: $sgpr28
+; SI-NEXT:    ; implicit-def: $sgpr24
+; SI-NEXT:    ; implicit-def: $sgpr20
+; SI-NEXT:    ; implicit-def: $vgpr47
+; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $vgpr45
+; SI-NEXT:    ; implicit-def: $vgpr56
+; SI-NEXT:    ; implicit-def: $sgpr44
+; SI-NEXT:    ; implicit-def: $sgpr40
+; SI-NEXT:    ; implicit-def: $sgpr26
+; SI-NEXT:    ; implicit-def: $vgpr58
+; SI-NEXT:    ; implicit-def: $vgpr13
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr20
+; SI-NEXT:    ; implicit-def: $sgpr58
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr42
+; SI-NEXT:    ; implicit-def: $vgpr41
+; SI-NEXT:    ; implicit-def: $vgpr12
+; SI-NEXT:    ; implicit-def: $vgpr49
+; SI-NEXT:    ; implicit-def: $sgpr72
+; SI-NEXT:    ; implicit-def: $sgpr60
+; SI-NEXT:    ; implicit-def: $sgpr56
+; SI-NEXT:    ; implicit-def: $vgpr59
+; SI-NEXT:    ; implicit-def: $vgpr4
+; SI-NEXT:    ; implicit-def: $sgpr78
+; SI-NEXT:    ; implicit-def: $sgpr74
+; SI-NEXT:    ; implicit-def: $sgpr62
+; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr10
+; SI-NEXT:    ; implicit-def: $sgpr92
+; SI-NEXT:    ; implicit-def: $sgpr88
+; SI-NEXT:    ; implicit-def: $sgpr76
+; SI-NEXT:    ; implicit-def: $vgpr60
+; SI-NEXT:    ; implicit-def: $vgpr23
+; SI-NEXT:    ; implicit-def: $sgpr34
+; SI-NEXT:    ; implicit-def: $sgpr94
+; SI-NEXT:    ; implicit-def: $sgpr90
+; SI-NEXT:    ; implicit-def: $vgpr25
+; SI-NEXT:    ; implicit-def: $vgpr18
+; SI-NEXT:    ; implicit-def: $sgpr38
+; SI-NEXT:    ; implicit-def: $sgpr36
+; SI-NEXT:    ; implicit-def: $sgpr30
+; SI-NEXT:    ; implicit-def: $vgpr22
+; SI-NEXT:    ; implicit-def: $sgpr52
+; SI-NEXT:    ; implicit-def: $sgpr50
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $vgpr17
+; SI-NEXT:    ; implicit-def: $sgpr68
+; SI-NEXT:    ; implicit-def: $sgpr64
+; SI-NEXT:    ; implicit-def: $sgpr54
+; SI-NEXT:    ; implicit-def: $sgpr82
+; SI-NEXT:    ; implicit-def: $sgpr70
+; SI-NEXT:    ; implicit-def: $sgpr66
+; SI-NEXT:    ; implicit-def: $sgpr96
+; SI-NEXT:    ; implicit-def: $sgpr84
+; SI-NEXT:    ; implicit-def: $sgpr80
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr98
+; SI-NEXT:    ; implicit-def: $sgpr86
+; SI-NEXT:    ; implicit-def: $vgpr30
+; SI-NEXT:    ; implicit-def: $vgpr57
+; SI-NEXT:    ; implicit-def: $vgpr46
+; SI-NEXT:    ; implicit-def: $vgpr38
+; SI-NEXT:    ; implicit-def: $vgpr36
+; SI-NEXT:    ; kill: killed $vgpr36
+; SI-NEXT:    ; implicit-def: $vgpr35
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
@@ -162872,1143 +164740,1007 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; kill: killed $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; kill: killed $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; kill: killed $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; kill: killed $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    v_mov_b32_e32 v40, v26
-; SI-NEXT:    v_mov_b32_e32 v58, v41
-; SI-NEXT:    s_mov_b64 s[4:5], -1
+; SI-NEXT:    ; kill: killed $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr11
+; SI-NEXT:    ; kill: killed $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr44
-; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; implicit-def: $vgpr62
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $vgpr46
-; SI-NEXT:    ; implicit-def: $vgpr49
-; SI-NEXT:    ; implicit-def: $vgpr47
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    ; implicit-def: $vgpr54
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $vgpr39
-; SI-NEXT:    ; implicit-def: $vgpr50
-; SI-NEXT:    ; implicit-def: $vgpr51
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr60
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr55
-; SI-NEXT:    ; implicit-def: $vgpr16
-; SI-NEXT:    ; implicit-def: $vgpr38
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr61
-; SI-NEXT:    ; implicit-def: $vgpr30
-; SI-NEXT:    ; implicit-def: $vgpr7
-; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $vgpr63
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr24
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr18
-; SI-NEXT:    ; implicit-def: $vgpr33
-; SI-NEXT:    ; implicit-def: $vgpr15
-; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr41
-; SI-NEXT:    ; implicit-def: $vgpr12
 ; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr57
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr26
-; SI-NEXT:    ; implicit-def: $vgpr43
-; SI-NEXT:    ; implicit-def: $vgpr29
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; kill: killed $vgpr11
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; kill: killed $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr11
+; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:  .LBB91_3: ; %Flow
-; SI-NEXT:    v_mov_b32_e32 v56, v40
-; SI-NEXT:    v_mov_b32_e32 v20, v11
-; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(3)
+; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; SI-NEXT:    v_mov_b32_e32 v40, v8
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
-; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT:    s_andn2_b64 vcc, exec, vcc
 ; SI-NEXT:    s_cbranch_vccnz .LBB91_5
 ; SI-NEXT:  ; %bb.4: ; %cmp.true
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT:    s_waitcnt vmcnt(8)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(8)
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v43
 ; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; SI-NEXT:    v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
-; SI-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
-; SI-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
-; SI-NEXT:    v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
-; SI-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
-; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v28
-; SI-NEXT:    v_and_b32_e32 v40, 0xffff0000, v40
-; SI-NEXT:    v_add_f32_e32 v40, 0x40c00000, v40
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v43, 0xffff0000, v43
-; SI-NEXT:    v_add_f32_e32 v43, 0x40c00000, v43
-; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v43
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v36, 0xffff0000, v36
-; SI-NEXT:    v_add_f32_e32 v36, 0x40c00000, v36
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v36
-; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT:    v_alignbit_b32 v29, v3, v2, 16
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; SI-NEXT:    v_alignbit_b32 v50, v4, v3, 16
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_alignbit_b32 v54, v5, v4, 16
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v6, 0x40c00000, v5
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v5
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT:    v_alignbit_b32 v7, v3, v2, 16
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT:    v_readfirstlane_b32 s4, v7
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
 ; SI-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
 ; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; SI-NEXT:    v_alignbit_b32 v45, v7, v5, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v11
-; SI-NEXT:    v_alignbit_b32 v9, v7, v6, 16
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_f32_e32 v7, 0x40c00000, v6
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v57, 24, v14
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v37
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; SI-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT:    v_alignbit_b32 v10, v6, v4, 16
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v5
+; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v33
+; SI-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    v_readfirstlane_b32 s52, v10
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v51
 ; SI-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
 ; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT:    v_alignbit_b32 v63, v10, v6, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v14
-; SI-NEXT:    v_alignbit_b32 v12, v10, v7, 16
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v48
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_f32_e32 v10, 0x40c00000, v7
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v17, 0x40c00000, v7
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v8
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v48, 0xffff0000, v48
-; SI-NEXT:    v_add_f32_e32 v48, 0x40c00000, v48
-; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v48
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
-; SI-NEXT:    v_alignbit_b32 v7, v13, v7, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v17
-; SI-NEXT:    v_alignbit_b32 v15, v13, v10, 16
-; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v10
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v2
-; SI-NEXT:    v_add_f32_e32 v20, 0x40c00000, v10
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v8
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v8
-; SI-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
-; SI-NEXT:    v_alignbit_b32 v61, v16, v10, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v20
-; SI-NEXT:    v_alignbit_b32 v18, v16, v13, 16
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v2
-; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_f32_e32 v16, 0x40c00000, v13
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; SI-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
-; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT:    v_alignbit_b32 v13, v19, v13, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v8
-; SI-NEXT:    v_alignbit_b32 v21, v19, v16, 16
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 24, v8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_alignbit_b32 v9, v3, v2, 16
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    v_readfirstlane_b32 s86, v9
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 24, v20
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshrrev_b32_e32 v20, 24, v11
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; SI-NEXT:    v_add_f32_e32 v19, 0x40c00000, v16
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; SI-NEXT:    v_add_f32_e32 v23, 0x40c00000, v16
-; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; SI-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT:    v_alignbit_b32 v16, v22, v16, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v23
-; SI-NEXT:    v_alignbit_b32 v24, v22, v19, 16
-; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshrrev_b32_e32 v23, 24, v23
-; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v23, 24, v17
+; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v50
+; SI-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; SI-NEXT:    v_add_f32_e32 v22, 0x40c00000, v19
-; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; SI-NEXT:    v_add_f32_e32 v31, 0x40c00000, v19
-; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; SI-NEXT:    v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT:    v_alignbit_b32 v3, v25, v19, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v31
-; SI-NEXT:    v_alignbit_b32 v27, v25, v22, 16
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_f32_e32 v25, 0x40c00000, v22
-; SI-NEXT:    v_and_b32_e32 v22, 0xffff0000, v52
-; SI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v22
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshrrev_b32_e32 v26, 16, v32
-; SI-NEXT:    v_alignbit_b32 v30, v26, v25, 16
-; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_alignbit_b32 v12, v3, v2, 16
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    v_readfirstlane_b32 s80, v12
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
-; SI-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
-; SI-NEXT:    v_alignbit_b32 v4, v28, v22, 16
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
-; SI-NEXT:    v_add_f32_e32 v28, 0x40c00000, v25
-; SI-NEXT:    v_and_b32_e32 v25, 0xffff0000, v59
-; SI-NEXT:    v_add_f32_e32 v39, 0x40c00000, v25
-; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v39
-; SI-NEXT:    v_alignbit_b32 v35, v34, v28, 16
-; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v33, 0xffff0000, v26
-; SI-NEXT:    v_add_f32_e32 v33, 0x40c00000, v33
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v33
-; SI-NEXT:    v_lshrrev_b32_e32 v26, 24, v43
-; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
-; SI-NEXT:    v_alignbit_b32 v43, v9, v1, 16
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
-; SI-NEXT:    v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT:    v_alignbit_b32 v10, v33, v25, 16
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
-; SI-NEXT:    v_add_f32_e32 v33, 0x40c00000, v28
-; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
-; SI-NEXT:    v_add_f32_e32 v49, 0x40c00000, v28
-; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v49
-; SI-NEXT:    v_alignbit_b32 v38, v37, v33, 16
-; SI-NEXT:    v_and_b32_e32 v33, 0xffff0000, v56
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
-; SI-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
-; SI-NEXT:    v_alignbit_b32 v6, v36, v28, 16
-; SI-NEXT:    v_add_f32_e32 v36, 0x40c00000, v33
-; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
-; SI-NEXT:    v_mov_b32_e32 v28, v29
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
-; SI-NEXT:    v_add_f32_e32 v51, 0x40c00000, v33
-; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v51
-; SI-NEXT:    v_alignbit_b32 v55, v53, v36, 16
-; SI-NEXT:    v_and_b32_e32 v36, 0xffff0000, v58
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v33, 0xffff0000, v33
-; SI-NEXT:    v_add_f32_e32 v33, 0x40c00000, v33
-; SI-NEXT:    v_alignbit_b32 v5, v48, v33, 16
-; SI-NEXT:    v_add_f32_e32 v48, 0x40c00000, v36
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; SI-NEXT:    v_mov_b32_e32 v33, v50
-; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v36, 0xffff0000, v36
-; SI-NEXT:    v_add_f32_e32 v52, 0x40c00000, v36
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v52
-; SI-NEXT:    v_alignbit_b32 v25, v60, v48, 16
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_alignbit_b32 v13, v3, v2, 16
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    v_readfirstlane_b32 s66, v13
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v50, 0xffff0000, v50
-; SI-NEXT:    v_add_f32_e32 v50, 0x40c00000, v50
-; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v50
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v36, 0xffff0000, v36
-; SI-NEXT:    v_add_f32_e32 v36, 0x40c00000, v36
-; SI-NEXT:    v_alignbit_b32 v2, v50, v36, 16
-; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; SI-NEXT:    v_mov_b32_e32 v36, v54
+; SI-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; SI-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v48, 0xffff0000, v48
-; SI-NEXT:    v_add_f32_e32 v48, 0x40c00000, v48
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v2
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v50, 0xffff0000, v50
-; SI-NEXT:    v_add_f32_e32 v54, 0x40c00000, v50
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v54
-; SI-NEXT:    v_alignbit_b32 v50, v19, v48, 16
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v40
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshrrev_b32_e32 v29, 24, v54
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v48, 0xffff0000, v48
-; SI-NEXT:    v_add_f32_e32 v48, 0x40c00000, v48
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v19, v19, v48, 16
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshrrev_b32_e32 v54, 8, v19
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v48, 0xffff0000, v48
-; SI-NEXT:    v_add_f32_e32 v41, 0x40c00000, v48
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v48, 0xffff0000, v48
-; SI-NEXT:    v_add_f32_e32 v42, 0x40c00000, v48
-; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v42
-; SI-NEXT:    v_alignbit_b32 v46, v47, v41, 16
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshrrev_b32_e32 v26, 24, v42
-; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v26, 24, v40
-; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v26, 24, v52
-; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v38, v4, v3, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v26, 24, v51
-; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v55
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v15
+; SI-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT:    v_alignbit_b32 v15, v7, v6, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
+; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v42
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v46, v6, v3, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v26, 24, v49
-; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v48
+; SI-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v39
+; SI-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT:    v_alignbit_b32 v23, v7, v6, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
+; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v53
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v57, v6, v3, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v26, 24, v39
-; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v54
+; SI-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v28
+; SI-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT:    v_alignbit_b32 v16, v7, v6, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v30, v6, v3, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v26, 24, v32
-; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v21
+; SI-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v40
+; SI-NEXT:    v_alignbit_b32 v18, v9, v7, 16
+; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v34
+; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT:    v_alignbit_b32 v20, v10, v9, 16
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    v_readfirstlane_b32 s38, v15
+; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    v_readfirstlane_b32 s90, v16
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v26, 24, v31
-; SI-NEXT:    v_mov_b32_e32 v51, v29
-; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT:    v_alignbit_b32 v39, v50, v10, 16
-; SI-NEXT:    v_alignbit_b32 v42, v25, v4, 8
-; SI-NEXT:    v_alignbit_b32 v40, v24, v63, 24
-; SI-NEXT:    v_alignbit_b32 v31, v21, v45, 16
-; SI-NEXT:    v_alignbit_b32 v32, v12, v28, 16
-; SI-NEXT:    v_alignbit_b32 v29, v9, v1, 24
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT:    v_readfirstlane_b32 s30, v23
+; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    v_readfirstlane_b32 s76, v18
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    v_readfirstlane_b32 s62, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v26, v9, v1, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v49, 8, v46
-; SI-NEXT:    s_waitcnt vmcnt(8)
-; SI-NEXT:    v_and_b32_e32 v41, 0xffff0000, v41
-; SI-NEXT:    v_add_f32_e32 v41, 0x40c00000, v41
-; SI-NEXT:    v_alignbit_b32 v44, v62, v41, 16
-; SI-NEXT:    v_alignbit_b32 v8, v44, v2, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 24, v5
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v46, v5, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v46, v5, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v46, v5, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v19, v6, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v19, v6, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v19, v6, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v50, v10, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v50, v10, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v25, v4, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v25, v4, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v55, v3, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v55, v3, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v55, v3, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v38, v16, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v38, v16, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v38, v16, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v35, v13, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v35, v13, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v35, v13, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v30, v61, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v30, v61, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v30, v61, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v27, v7, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v27, v7, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v27, v7, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v24, v63, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v24, v63, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v21, v45, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v21, v45, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v18, v36, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v18, v36, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v18, v36, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v15, v33, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v15, v33, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v15, v33, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v12, v28, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 8, v50
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 8, v25
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 8, v55
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 8, v38
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 8, v35
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 8, v30
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 24, v4
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT:    v_readfirstlane_b32 s5, v38
+; SI-NEXT:    v_readfirstlane_b32 s87, v46
+; SI-NEXT:    v_readfirstlane_b32 s81, v57
+; SI-NEXT:    v_readfirstlane_b32 s67, v30
+; SI-NEXT:    s_lshr_b64 s[54:55], s[66:67], 24
+; SI-NEXT:    s_lshr_b64 s[64:65], s[66:67], 16
+; SI-NEXT:    s_lshr_b64 s[68:69], s[66:67], 8
+; SI-NEXT:    s_lshr_b64 s[66:67], s[80:81], 24
+; SI-NEXT:    s_lshr_b64 s[70:71], s[80:81], 16
+; SI-NEXT:    s_lshr_b64 s[82:83], s[80:81], 8
+; SI-NEXT:    s_lshr_b64 s[80:81], s[86:87], 24
+; SI-NEXT:    s_lshr_b64 s[84:85], s[86:87], 16
+; SI-NEXT:    s_lshr_b64 s[96:97], s[86:87], 8
+; SI-NEXT:    s_lshr_b64 s[86:87], s[4:5], 24
+; SI-NEXT:    s_lshr_b64 s[98:99], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[4:5], 8
+; SI-NEXT:    v_lshrrev_b32_e32 v35, 24, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 8, v30
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(14)
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v17, v7, v3, 16
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 8, v27
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; SI-NEXT:    v_readfirstlane_b32 s53, v17
+; SI-NEXT:    s_lshr_b64 s[48:49], s[52:53], 24
+; SI-NEXT:    s_waitcnt vmcnt(14)
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT:    v_alignbit_b32 v21, v12, v10, 16
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(14)
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT:    s_waitcnt vmcnt(13)
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; SI-NEXT:    v_readfirstlane_b32 s56, v21
+; SI-NEXT:    s_lshr_b64 s[50:51], s[52:53], 16
+; SI-NEXT:    s_lshr_b64 s[52:53], s[52:53], 8
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 24, v6
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; SI-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; SI-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
+; SI-NEXT:    v_alignbit_b32 v22, v9, v3, 16
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; SI-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; SI-NEXT:    v_alignbit_b32 v23, v13, v12, 16
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 8, v24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v31
+; SI-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v9
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    v_readfirstlane_b32 s42, v23
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v22
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 8, v21
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v17
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 8, v18
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v57
+; SI-NEXT:    v_readfirstlane_b32 s39, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v9, 24, v9
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 24, v7
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 8, v15
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v46
+; SI-NEXT:    s_lshr_b64 s[36:37], s[38:39], 16
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 8, v12
-; SI-NEXT:    v_alignbit_b32 v17, v44, v2, 24
-; SI-NEXT:    v_alignbit_b32 v11, v44, v2, 8
-; SI-NEXT:    v_alignbit_b32 v41, v12, v28, 24
-; SI-NEXT:    v_lshrrev_b32_e32 v14, 8, v44
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v38
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(12)
+; SI-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; SI-NEXT:    v_add_f32_e32 v21, 0x40c00000, v21
+; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v21
+; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(11)
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT:    v_alignbit_b32 v24, v15, v13, 16
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT:    v_alignbit_b32 v25, v10, v3, 16
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; SI-NEXT:    v_readfirstlane_b32 s31, v25
+; SI-NEXT:    v_readfirstlane_b32 s26, v24
+; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT:    s_lshr_b64 s[94:95], s[30:31], 16
+; SI-NEXT:    s_lshr_b64 s[34:35], s[30:31], 8
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; SI-NEXT:    v_add_f32_e32 v23, 0x40c00000, v23
+; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v23
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT:    v_alignbit_b32 v26, v16, v15, 16
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT:    v_readfirstlane_b32 s20, v26
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 8, v9
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT:  .LBB91_5: ; %end
-; SI-NEXT:    s_waitcnt vmcnt(4) expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v26, 24, v21
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 24, v17
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT:    v_or_b32_e32 v11, v17, v11
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 8, v14
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v44
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    v_and_b32_e32 v11, 0xff, v62
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 24, v14
-; SI-NEXT:    v_or_b32_e32 v11, v17, v11
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 4, v0
-; SI-NEXT:    buffer_store_dword v8, v11, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v5
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT:    v_alignbit_b32 v27, v18, v16, 16
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v10
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v60, v12, v3, 16
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; SI-NEXT:    v_readfirstlane_b32 s91, v60
+; SI-NEXT:    v_readfirstlane_b32 s14, v27
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 24, v10
+; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT:    s_lshr_b64 s[88:89], s[90:91], 16
+; SI-NEXT:    s_lshr_b64 s[92:93], s[90:91], 8
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; SI-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; SI-NEXT:    v_alignbit_b32 v29, v20, v18, 16
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT:    v_readfirstlane_b32 s8, v29
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v12
+; SI-NEXT:    v_alignbit_b32 v61, v11, v3, 16
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT:    v_readfirstlane_b32 s77, v61
+; SI-NEXT:    v_lshrrev_b32_e32 v12, 24, v12
+; SI-NEXT:    s_lshr_b64 s[74:75], s[76:77], 16
+; SI-NEXT:    s_lshr_b64 s[78:79], s[76:77], 8
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 8, v61
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v13
+; SI-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 24, v13
+; SI-NEXT:    v_lshrrev_b32_e32 v56, 24, v18
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT:    v_alignbit_b32 v59, v36, v3, 16
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v8
+; SI-NEXT:    v_add_f32_e32 v15, 0x40c00000, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT:    v_readfirstlane_b32 s63, v59
+; SI-NEXT:    s_lshr_b64 s[60:61], s[62:63], 16
+; SI-NEXT:    s_lshr_b64 s[72:73], s[62:63], 8
+; SI-NEXT:    v_lshrrev_b32_e32 v4, 8, v59
+; SI-NEXT:    v_lshrrev_b32_e32 v18, 8, v25
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; SI-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v24, 24, v20
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v3
+; SI-NEXT:    v_alignbit_b32 v41, v49, v15, 16
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; SI-NEXT:    v_readfirstlane_b32 s57, v41
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
+; SI-NEXT:    s_lshr_b64 s[46:47], s[56:57], 16
+; SI-NEXT:    s_lshr_b64 s[58:59], s[56:57], 8
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(3)
+; SI-NEXT:    v_lshrrev_b32_e32 v12, 8, v41
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 24, v14
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT:    v_add_f32_e32 v16, 0x40c00000, v15
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT:    v_or_b32_e32 v11, v17, v11
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 8, v0
-; SI-NEXT:    buffer_store_dword v8, v11, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v46
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 8, v49
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    v_and_b32_e32 v11, 0xff, v47
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; SI-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v15
+; SI-NEXT:    v_alignbit_b32 v58, v32, v16, 16
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; SI-NEXT:    v_readfirstlane_b32 s43, v58
+; SI-NEXT:    s_lshr_b64 s[40:41], s[42:43], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[42:43], 8
+; SI-NEXT:    v_lshrrev_b32_e32 v20, 24, v15
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 8, v58
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 24, v14
-; SI-NEXT:    v_or_b32_e32 v11, v14, v11
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 12, v0
-; SI-NEXT:    buffer_store_dword v8, v11, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v6
-; SI-NEXT:    v_and_b32_e32 v6, 0xff, v63
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 24, v14
+; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT:    v_alignbit_b32 v47, v45, v16, 16
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; SI-NEXT:    v_readfirstlane_b32 s27, v47
+; SI-NEXT:    v_lshrrev_b32_e32 v9, 8, v47
+; SI-NEXT:    s_lshr_b64 s[24:25], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[28:29], s[26:27], 8
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT:    v_or_b32_e32 v11, v14, v11
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 16, v0
-; SI-NEXT:    buffer_store_dword v8, v11, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v19
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 8, v54
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT:    v_alignbit_b32 v19, v11, v16, 16
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT:    v_readfirstlane_b32 s21, v19
+; SI-NEXT:    v_lshrrev_b32_e32 v27, 8, v19
+; SI-NEXT:    s_lshr_b64 s[18:19], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[22:23], s[20:21], 8
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 24, v14
-; SI-NEXT:    v_or_b32_e32 v11, v14, v11
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 20, v0
-; SI-NEXT:    buffer_store_dword v8, v11, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v10
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    v_and_b32_e32 v11, 0xff, v39
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT:    v_alignbit_b32 v14, v52, v16, 16
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; SI-NEXT:    v_readfirstlane_b32 s15, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 8, v14
+; SI-NEXT:    s_lshr_b64 s[12:13], s[14:15], 16
+; SI-NEXT:    s_lshr_b64 s[16:17], s[14:15], 8
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 24, v14
-; SI-NEXT:    v_or_b32_e32 v11, v14, v11
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 24, v0
-; SI-NEXT:    buffer_store_dword v8, v11, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_alignbit_b32 v11, v44, v16, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v16, 24, v23
+; SI-NEXT:    v_lshrrev_b32_e32 v23, 8, v60
+; SI-NEXT:    v_readfirstlane_b32 s9, v11
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 8, v11
+; SI-NEXT:    s_lshr_b64 s[6:7], s[8:9], 24
+; SI-NEXT:    s_lshr_b64 s[10:11], s[8:9], 8
+; SI-NEXT:    v_writelane_b32 v62, s6, 0
+; SI-NEXT:    v_writelane_b32 v62, s7, 1
+; SI-NEXT:    s_lshr_b64 s[6:7], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[14:15], 24
+; SI-NEXT:    s_lshr_b64 s[14:15], s[20:21], 24
+; SI-NEXT:    s_lshr_b64 s[20:21], s[26:27], 24
+; SI-NEXT:    s_lshr_b64 s[26:27], s[42:43], 24
+; SI-NEXT:    s_lshr_b64 s[42:43], s[56:57], 24
+; SI-NEXT:    s_lshr_b64 s[56:57], s[62:63], 24
+; SI-NEXT:    s_lshr_b64 s[62:63], s[76:77], 24
+; SI-NEXT:    s_lshr_b64 s[76:77], s[90:91], 24
+; SI-NEXT:    s_lshr_b64 s[90:91], s[30:31], 24
+; SI-NEXT:    s_lshr_b64 s[30:31], s[38:39], 24
+; SI-NEXT:    s_lshr_b64 s[38:39], s[38:39], 8
+; SI-NEXT:  .LBB91_5: ; %end
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v29
+; SI-NEXT:    s_lshl_b32 s5, s10, 8
+; SI-NEXT:    v_or_b32_e32 v1, s5, v1
+; SI-NEXT:    s_and_b32 s5, s6, 0xff
+; SI-NEXT:    v_readlane_b32 s6, v62, 0
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    s_lshl_b32 s6, s6, 24
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    v_or_b32_e32 v1, s5, v1
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v50
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 24, v51
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 8, v5
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v11, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT:    v_or_b32_e32 v11, v14, v11
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 28, v0
-; SI-NEXT:    buffer_store_dword v8, v11, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v11
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v5
+; SI-NEXT:    v_and_b32_e32 v5, 0xff, v44
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v16
+; SI-NEXT:    v_or_b32_e32 v5, v6, v5
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v1, v1, v5
+; SI-NEXT:    v_add_i32_e32 v5, vcc, 4, v0
+; SI-NEXT:    buffer_store_dword v1, v5, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 8, v42
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v45
-; SI-NEXT:    v_and_b32_e32 v4, 0xff, v36
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v11, 0xff, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT:    s_lshl_b32 s5, s16, 8
+; SI-NEXT:    s_lshl_b32 s6, s8, 24
+; SI-NEXT:    v_add_i32_e32 v5, vcc, 8, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v26
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-NEXT:    s_lshl_b32 s4, s4, 8
+; SI-NEXT:    v_readlane_b32 s7, v62, 1
+; SI-NEXT:    v_readlane_b32 s99, v63, 35
+; SI-NEXT:    v_readlane_b32 s97, v63, 33
+; SI-NEXT:    v_readlane_b32 s87, v63, 31
+; SI-NEXT:    v_readlane_b32 s85, v63, 29
+; SI-NEXT:    v_readlane_b32 s83, v63, 27
+; SI-NEXT:    v_readlane_b32 s81, v63, 25
+; SI-NEXT:    v_readlane_b32 s71, v63, 23
+; SI-NEXT:    v_readlane_b32 s69, v63, 21
+; SI-NEXT:    v_readlane_b32 s67, v63, 19
+; SI-NEXT:    v_readlane_b32 s65, v63, 17
+; SI-NEXT:    v_readlane_b32 s55, v63, 15
+; SI-NEXT:    v_readlane_b32 s53, v63, 13
+; SI-NEXT:    v_readlane_b32 s51, v63, 11
+; SI-NEXT:    v_readlane_b32 s49, v63, 9
+; SI-NEXT:    v_readlane_b32 s39, v63, 7
+; SI-NEXT:    v_readlane_b32 s37, v63, 5
+; SI-NEXT:    v_readlane_b32 s35, v63, 3
+; SI-NEXT:    v_readlane_b32 s31, v63, 1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 24, v2
-; SI-NEXT:    v_or_b32_e32 v11, v14, v11
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 32, v0
-; SI-NEXT:    buffer_store_dword v8, v11, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT:    v_or_b32_e32 v1, s5, v1
+; SI-NEXT:    s_and_b32 s5, s12, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    v_or_b32_e32 v1, s5, v1
+; SI-NEXT:    buffer_store_dword v1, v5, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v25
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 8, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    v_and_b32_e32 v11, 0xff, v60
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 24, v2
-; SI-NEXT:    v_or_b32_e32 v11, v14, v11
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 36, v0
-; SI-NEXT:    buffer_store_dword v8, v11, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v7
+; SI-NEXT:    v_or_b32_e32 v1, v1, v5
+; SI-NEXT:    v_and_b32_e32 v5, 0xff, v52
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_or_b32_e32 v5, v6, v5
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v1, v1, v5
+; SI-NEXT:    v_add_i32_e32 v5, vcc, 12, v0
+; SI-NEXT:    buffer_store_dword v1, v5, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v3
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v33
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 8, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v11, 0xff, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT:    s_lshl_b32 s5, s22, 8
+; SI-NEXT:    s_lshl_b32 s6, s14, 24
+; SI-NEXT:    v_add_i32_e32 v5, vcc, 16, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v24
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 24, v2
-; SI-NEXT:    v_or_b32_e32 v11, v14, v11
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 40, v0
-; SI-NEXT:    buffer_store_dword v8, v11, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT:    v_or_b32_e32 v1, s5, v1
+; SI-NEXT:    s_and_b32 s5, s18, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    v_or_b32_e32 v1, s5, v1
+; SI-NEXT:    buffer_store_dword v1, v5, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v55
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 8, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    v_and_b32_e32 v11, 0xff, v53
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v19
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v27
+; SI-NEXT:    v_or_b32_e32 v1, v1, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    s_lshl_b32 s5, s28, 8
+; SI-NEXT:    s_lshl_b32 s6, s20, 24
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 24, v2
-; SI-NEXT:    v_or_b32_e32 v11, v14, v11
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 44, v0
-; SI-NEXT:    buffer_store_dword v8, v11, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_or_b32_e32 v5, v6, v5
+; SI-NEXT:    v_or_b32_e32 v1, v1, v5
+; SI-NEXT:    v_add_i32_e32 v5, vcc, 20, v0
+; SI-NEXT:    buffer_store_dword v1, v5, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v16
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 8, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 24, v14
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v5, vcc, 24, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v56
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v11, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT:    v_or_b32_e32 v11, v14, v11
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 48, v0
-; SI-NEXT:    buffer_store_dword v8, v11, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT:    v_or_b32_e32 v1, s5, v1
+; SI-NEXT:    s_and_b32 s5, s24, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    v_or_b32_e32 v1, s5, v1
+; SI-NEXT:    buffer_store_dword v1, v5, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v47
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v9
+; SI-NEXT:    v_or_b32_e32 v1, v1, v5
+; SI-NEXT:    v_and_b32_e32 v5, 0xff, v45
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_or_b32_e32 v5, v6, v5
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v1, v1, v5
+; SI-NEXT:    v_add_i32_e32 v5, vcc, 28, v0
+; SI-NEXT:    buffer_store_dword v1, v5, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v38
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT:    s_lshl_b32 s5, s44, 8
+; SI-NEXT:    s_lshl_b32 s6, s26, 24
+; SI-NEXT:    v_add_i32_e32 v5, vcc, 32, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v20
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 8, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    v_and_b32_e32 v11, 0xff, v37
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT:    v_or_b32_e32 v1, s5, v1
+; SI-NEXT:    s_and_b32 s5, s40, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    v_or_b32_e32 v1, s5, v1
+; SI-NEXT:    buffer_store_dword v1, v5, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v58
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v13
+; SI-NEXT:    v_or_b32_e32 v1, v1, v5
+; SI-NEXT:    v_and_b32_e32 v5, 0xff, v32
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v5, v6, v5
+; SI-NEXT:    v_or_b32_e32 v1, v1, v5
+; SI-NEXT:    v_add_i32_e32 v5, vcc, 36, v0
+; SI-NEXT:    buffer_store_dword v1, v5, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT:    s_lshl_b32 s5, s58, 8
+; SI-NEXT:    s_lshl_b32 s6, s42, 24
+; SI-NEXT:    v_add_i32_e32 v5, vcc, 40, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 24, v2
-; SI-NEXT:    v_or_b32_e32 v11, v14, v11
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 52, v0
-; SI-NEXT:    buffer_store_dword v8, v11, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT:    v_or_b32_e32 v1, s5, v1
+; SI-NEXT:    s_and_b32 s5, s46, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    v_or_b32_e32 v1, s5, v1
+; SI-NEXT:    buffer_store_dword v1, v5, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v13
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 24, v13
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v41
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v12
+; SI-NEXT:    v_or_b32_e32 v1, v1, v5
+; SI-NEXT:    v_and_b32_e32 v5, 0xff, v49
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    s_lshl_b32 s5, s72, 8
+; SI-NEXT:    s_lshl_b32 s6, s56, 24
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT:    v_or_b32_e32 v11, v13, v11
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 56, v0
-; SI-NEXT:    buffer_store_dword v8, v11, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; SI-NEXT:    v_or_b32_e32 v3, v3, v5
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 44, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v35
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 48, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 8, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    v_and_b32_e32 v11, 0xff, v34
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT:    v_or_b32_e32 v1, s5, v1
+; SI-NEXT:    s_and_b32 s5, s60, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    v_or_b32_e32 v1, s5, v1
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v4
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v59
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_and_b32_e32 v3, 0xff, v36
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    s_lshl_b32 s5, s78, 8
+; SI-NEXT:    s_lshl_b32 s6, s62, 24
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 24, v2
-; SI-NEXT:    v_or_b32_e32 v11, v13, v11
-; SI-NEXT:    v_or_b32_e32 v8, v8, v11
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 60, v0
-; SI-NEXT:    buffer_store_dword v8, v11, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; SI-NEXT:    v_or_b32_e32 v3, v4, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 52, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v61
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
-; SI-NEXT:    v_or_b32_e32 v8, v8, v10
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 56, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT:    v_or_b32_e32 v10, v11, v10
-; SI-NEXT:    v_or_b32_e32 v8, v8, v10
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 64, v0
-; SI-NEXT:    buffer_store_dword v8, v10, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT:    v_or_b32_e32 v1, s5, v1
+; SI-NEXT:    s_and_b32 s5, s74, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    v_or_b32_e32 v1, s5, v1
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v30
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 8, v2
-; SI-NEXT:    v_or_b32_e32 v8, v8, v10
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v61
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v10
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    s_lshl_b32 s5, s92, 8
+; SI-NEXT:    s_lshl_b32 s6, s76, 24
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT:    v_or_b32_e32 v10, v11, v10
-; SI-NEXT:    v_or_b32_e32 v8, v8, v10
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 0x44, v0
-; SI-NEXT:    buffer_store_dword v8, v10, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; SI-NEXT:    v_or_b32_e32 v3, v4, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 60, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
-; SI-NEXT:    v_or_b32_e32 v7, v7, v8
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 64, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; SI-NEXT:    v_or_b32_e32 v8, v10, v8
-; SI-NEXT:    v_or_b32_e32 v7, v7, v8
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x48, v0
-; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT:    v_or_b32_e32 v1, s5, v1
+; SI-NEXT:    s_and_b32 s5, s88, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    v_or_b32_e32 v1, s5, v1
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v27
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 8, v2
-; SI-NEXT:    v_or_b32_e32 v7, v7, v8
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v60
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v23
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    s_lshl_b32 s5, s34, 8
+; SI-NEXT:    s_lshl_b32 s6, s90, 24
+; SI-NEXT:    v_readlane_b32 s34, v63, 2
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; SI-NEXT:    v_or_b32_e32 v8, v10, v8
-; SI-NEXT:    v_or_b32_e32 v7, v7, v8
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x4c, v0
-; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; SI-NEXT:    v_or_b32_e32 v3, v4, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x44, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 24, v40
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
-; SI-NEXT:    v_or_b32_e32 v6, v6, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x48, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    v_or_b32_e32 v7, v8, v7
-; SI-NEXT:    v_or_b32_e32 v6, v6, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x50, v0
-; SI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT:    v_or_b32_e32 v1, s5, v1
+; SI-NEXT:    s_and_b32 s5, s94, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    v_or_b32_e32 v1, s5, v1
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v6, 0xff, v24
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v2
-; SI-NEXT:    v_or_b32_e32 v6, v6, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v25
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v18
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    s_lshl_b32 s5, s38, 8
+; SI-NEXT:    s_lshl_b32 s6, s30, 24
+; SI-NEXT:    v_readlane_b32 s38, v63, 6
+; SI-NEXT:    v_readlane_b32 s30, v63, 0
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    v_or_b32_e32 v7, v8, v7
-; SI-NEXT:    v_or_b32_e32 v6, v6, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x54, v0
-; SI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; SI-NEXT:    v_or_b32_e32 v3, v4, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x4c, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
-; SI-NEXT:    v_or_b32_e32 v5, v5, v6
-; SI-NEXT:    v_and_b32_e32 v6, 0xff, v31
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x50, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; SI-NEXT:    v_or_b32_e32 v6, v7, v6
-; SI-NEXT:    v_or_b32_e32 v5, v5, v6
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x58, v0
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT:    v_or_b32_e32 v1, s5, v1
+; SI-NEXT:    s_and_b32 s5, s36, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    v_or_b32_e32 v1, s5, v1
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v21
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v22
+; SI-NEXT:    s_lshl_b32 s5, s52, 8
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT:    s_lshl_b32 s6, s48, 24
+; SI-NEXT:    v_readlane_b32 s52, v63, 12
+; SI-NEXT:    v_readlane_b32 s48, v63, 8
+; SI-NEXT:    v_readlane_b32 s36, v63, 4
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v2
-; SI-NEXT:    v_or_b32_e32 v5, v5, v6
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT:    v_or_b32_e32 v6, v7, v6
-; SI-NEXT:    v_or_b32_e32 v5, v5, v6
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x5c, v0
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v3, v4, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x54, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; SI-NEXT:    v_or_b32_e32 v4, v4, v5
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x58, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_or_b32_e32 v5, v6, v5
-; SI-NEXT:    v_or_b32_e32 v4, v4, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x60, v0
-; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT:    v_or_b32_e32 v1, s5, v1
+; SI-NEXT:    s_and_b32 s5, s50, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    v_or_b32_e32 v1, s5, v1
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xff, v18
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v17
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT:    s_lshl_b32 s5, s68, 8
+; SI-NEXT:    s_lshl_b32 s6, s54, 24
+; SI-NEXT:    v_readlane_b32 s68, v63, 20
+; SI-NEXT:    v_readlane_b32 s54, v63, 14
+; SI-NEXT:    v_readlane_b32 s50, v63, 10
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v2
-; SI-NEXT:    v_or_b32_e32 v4, v4, v5
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_or_b32_e32 v5, v6, v5
-; SI-NEXT:    v_or_b32_e32 v4, v4, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x64, v0
-; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v3, v4, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x5c, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; SI-NEXT:    v_or_b32_e32 v3, v3, v4
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x60, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT:    v_or_b32_e32 v4, v5, v4
-; SI-NEXT:    v_or_b32_e32 v3, v3, v4
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x68, v0
-; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT:    v_or_b32_e32 v1, s5, v1
+; SI-NEXT:    s_and_b32 s5, s64, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    v_or_b32_e32 v1, s5, v1
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v15
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 24, v23
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v2
-; SI-NEXT:    v_or_b32_e32 v3, v3, v4
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v28
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v30
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    s_lshl_b32 s5, s82, 8
+; SI-NEXT:    s_lshl_b32 s6, s66, 24
+; SI-NEXT:    v_readlane_b32 s82, v63, 26
+; SI-NEXT:    v_readlane_b32 s66, v63, 18
+; SI-NEXT:    v_readlane_b32 s64, v63, 16
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT:    v_or_b32_e32 v4, v5, v4
-; SI-NEXT:    v_or_b32_e32 v3, v3, v4
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x6c, v0
-; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; SI-NEXT:    v_or_b32_e32 v2, v3, v2
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x64, v0
+; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v41
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x68, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v32
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    v_or_b32_e32 v3, v4, v3
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x70, v0
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT:    v_or_b32_e32 v1, s5, v1
+; SI-NEXT:    s_and_b32 s5, s70, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    v_or_b32_e32 v1, s5, v1
+; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v57
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v57
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT:    s_lshl_b32 s5, s96, 8
+; SI-NEXT:    s_lshl_b32 s6, s80, 24
+; SI-NEXT:    v_readlane_b32 s96, v63, 32
+; SI-NEXT:    v_readlane_b32 s80, v63, 24
+; SI-NEXT:    v_readlane_b32 s70, v63, 22
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_or_b32_e32 v2, v3, v2
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x6c, v0
+; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x70, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_or_b32_e32 v3, v4, v3
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x74, v0
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT:    v_or_b32_e32 v1, s5, v1
+; SI-NEXT:    s_and_b32 s5, s84, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    v_or_b32_e32 v1, s5, v1
+; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v26
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v46
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT:    s_lshl_b32 s5, s86, 24
+; SI-NEXT:    v_readlane_b32 s86, v63, 30
+; SI-NEXT:    v_readlane_b32 s84, v63, 28
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v43
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v29
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x78, v0
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x74, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v20
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x78, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v35
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT:    v_or_b32_e32 v1, s4, v1
+; SI-NEXT:    s_and_b32 s4, s98, 0xff
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    v_or_b32_e32 v1, s4, v1
+; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v38
+; SI-NEXT:    v_readlane_b32 s98, v63, 34
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
@@ -164016,22 +165748,24 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -186200,23 +187934,24 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; SI-LABEL: bitcast_v64f16_to_v128i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
+; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:80
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:4
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32
@@ -186225,1829 +187960,1684 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:16
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:28
-; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:24
-; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:36
-; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:44
-; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:52
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:60
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:56
-; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:68
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:64
-; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:76
-; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:72
+; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:24
+; SI-NEXT:    s_waitcnt expcnt(5)
+; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:36
+; SI-NEXT:    s_waitcnt expcnt(4)
+; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:32
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:44
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_cvt_f16_f32_e32 v63, v5
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:40
+; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:52
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:48
+; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:60
+; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:56
+; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:68
+; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:64
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:76
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:72
+; SI-NEXT:    v_writelane_b32 v63, s30, 0
+; SI-NEXT:    v_writelane_b32 v63, s31, 1
+; SI-NEXT:    v_writelane_b32 v63, s34, 2
+; SI-NEXT:    v_writelane_b32 v63, s35, 3
+; SI-NEXT:    v_writelane_b32 v63, s36, 4
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_writelane_b32 v63, s37, 5
+; SI-NEXT:    v_writelane_b32 v63, s38, 6
+; SI-NEXT:    v_writelane_b32 v63, s39, 7
+; SI-NEXT:    v_writelane_b32 v63, s48, 8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v29, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v60, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v9
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v11
-; SI-NEXT:    v_mov_b32_e32 v59, v18
-; SI-NEXT:    v_cvt_f16_f32_e32 v43, v4
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v7
+; SI-NEXT:    v_writelane_b32 v63, s49, 9
+; SI-NEXT:    v_writelane_b32 v63, s50, 10
+; SI-NEXT:    v_writelane_b32 v63, s51, 11
+; SI-NEXT:    v_writelane_b32 v63, s52, 12
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v13
-; SI-NEXT:    v_cvt_f16_f32_e32 v42, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v9
+; SI-NEXT:    v_writelane_b32 v63, s53, 13
+; SI-NEXT:    v_writelane_b32 v63, s54, 14
+; SI-NEXT:    v_writelane_b32 v63, s55, 15
+; SI-NEXT:    v_writelane_b32 v63, s64, 16
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v26
+; SI-NEXT:    v_writelane_b32 v63, s65, 17
+; SI-NEXT:    v_writelane_b32 v63, s66, 18
+; SI-NEXT:    v_writelane_b32 v63, s67, 19
+; SI-NEXT:    v_writelane_b32 v63, s68, 20
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v25
+; SI-NEXT:    v_writelane_b32 v63, s69, 21
+; SI-NEXT:    v_writelane_b32 v63, s70, 22
+; SI-NEXT:    v_writelane_b32 v63, s71, 23
+; SI-NEXT:    v_writelane_b32 v63, s80, 24
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v30
+; SI-NEXT:    v_writelane_b32 v63, s81, 25
+; SI-NEXT:    v_writelane_b32 v63, s82, 26
+; SI-NEXT:    v_writelane_b32 v63, s83, 27
+; SI-NEXT:    v_writelane_b32 v63, s84, 28
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    v_writelane_b32 v63, s85, 29
+; SI-NEXT:    v_writelane_b32 v63, s86, 30
+; SI-NEXT:    v_writelane_b32 v63, s87, 31
+; SI-NEXT:    v_mov_b32_e32 v46, v29
+; SI-NEXT:    v_writelane_b32 v63, s96, 32
+; SI-NEXT:    v_cvt_f16_f32_e32 v47, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v33, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v61, v8
-; SI-NEXT:    v_cvt_f16_f32_e32 v58, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v43, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v8
 ; SI-NEXT:    v_cvt_f16_f32_e32 v32, v10
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v12
+; SI-NEXT:    v_cvt_f16_f32_e32 v7, v12
+; SI-NEXT:    v_cvt_f16_f32_e32 v29, v11
 ; SI-NEXT:    v_cvt_f16_f32_e32 v31, v14
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v13
+; SI-NEXT:    v_cvt_f16_f32_e32 v58, v16
+; SI-NEXT:    v_cvt_f16_f32_e32 v13, v15
+; SI-NEXT:    s_waitcnt vmcnt(14)
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v34
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v18
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, v17
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v16
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, v59
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v20
-; SI-NEXT:    v_cvt_f16_f32_e32 v59, v19
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v37
+; SI-NEXT:    v_cvt_f16_f32_e32 v34, v20
+; SI-NEXT:    v_cvt_f16_f32_e32 v16, v19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v9, v22
-; SI-NEXT:    v_cvt_f16_f32_e32 v56, v21
-; SI-NEXT:    v_cvt_f16_f32_e32 v18, v24
-; SI-NEXT:    v_cvt_f16_f32_e32 v62, v23
-; SI-NEXT:    v_cvt_f16_f32_e32 v7, v26
-; SI-NEXT:    v_cvt_f16_f32_e32 v25, v25
-; SI-NEXT:    v_cvt_f16_f32_e32 v26, v28
-; SI-NEXT:    v_cvt_f16_f32_e32 v27, v27
-; SI-NEXT:    v_cvt_f16_f32_e32 v8, v30
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, s16
-; SI-NEXT:    v_cvt_f16_f32_e32 v12, s18
-; SI-NEXT:    v_cvt_f16_f32_e32 v16, s20
-; SI-NEXT:    v_cvt_f16_f32_e32 v20, s23
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, s22
-; SI-NEXT:    v_cvt_f16_f32_e32 v22, s25
-; SI-NEXT:    v_cvt_f16_f32_e32 v21, s24
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, s27
-; SI-NEXT:    v_cvt_f16_f32_e32 v19, s26
-; SI-NEXT:    v_cvt_f16_f32_e32 v24, s29
-; SI-NEXT:    v_cvt_f16_f32_e32 v23, s28
+; SI-NEXT:    v_cvt_f16_f32_e32 v61, v21
+; SI-NEXT:    v_cvt_f16_f32_e32 v21, v24
+; SI-NEXT:    v_cvt_f16_f32_e32 v24, v23
+; SI-NEXT:    v_cvt_f16_f32_e32 v44, v28
+; SI-NEXT:    v_cvt_f16_f32_e32 v42, v27
+; SI-NEXT:    v_cvt_f16_f32_e32 v46, v46
+; SI-NEXT:    v_cvt_f16_f32_e32 v8, v35
+; SI-NEXT:    v_cvt_f16_f32_e32 v23, v36
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v38
+; SI-NEXT:    v_cvt_f16_f32_e32 v27, v39
+; SI-NEXT:    v_cvt_f16_f32_e32 v28, v48
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v49
+; SI-NEXT:    v_cvt_f16_f32_e32 v45, v45
+; SI-NEXT:    v_cvt_f16_f32_e32 v36, v56
+; SI-NEXT:    v_cvt_f16_f32_e32 v57, v57
+; SI-NEXT:    v_cvt_f16_f32_e32 v59, v59
+; SI-NEXT:    v_cvt_f16_f32_e32 v56, v60
 ; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v34
-; SI-NEXT:    v_cvt_f16_f32_e32 v34, v17
-; SI-NEXT:    v_cvt_f16_f32_e32 v30, v35
-; SI-NEXT:    v_cvt_f16_f32_e32 v36, v36
-; SI-NEXT:    v_cvt_f16_f32_e32 v35, v38
-; SI-NEXT:    v_cvt_f16_f32_e32 v38, v39
-; SI-NEXT:    v_cvt_f16_f32_e32 v39, v48
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, v49
-; SI-NEXT:    v_cvt_f16_f32_e32 v50, v50
-; SI-NEXT:    v_cvt_f16_f32_e32 v49, v51
+; SI-NEXT:    v_cvt_f16_f32_e32 v26, v50
 ; SI-NEXT:    s_waitcnt vmcnt(13)
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, v52
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(13) expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, v53
+; SI-NEXT:    v_cvt_f16_f32_e32 v25, v51
 ; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_cvt_f16_f32_e32 v54, v54
+; SI-NEXT:    v_cvt_f16_f32_e32 v52, v52
 ; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_cvt_f16_f32_e32 v51, v55
+; SI-NEXT:    v_cvt_f16_f32_e32 v50, v53
 ; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_cvt_f16_f32_e32 v53, v40
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f16_f32_e32 v49, v54
 ; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_cvt_f16_f32_e32 v40, v44
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, s17
-; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_cvt_f16_f32_e32 v52, v47
-; SI-NEXT:    v_cvt_f16_f32_e32 v44, s19
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, s21
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_cvt_f16_f32_e32 v28, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v37
-; SI-NEXT:    v_cvt_f16_f32_e32 v37, v41
-; SI-NEXT:    v_cvt_f16_f32_e32 v41, v45
-; SI-NEXT:    v_cvt_f16_f32_e32 v45, v46
-; SI-NEXT:    v_cvt_f16_f32_e32 v46, v57
+; SI-NEXT:    v_cvt_f16_f32_e32 v54, v55
+; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_cvt_f16_f32_e32 v53, v40
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_cvt_f16_f32_e32 v55, v41
+; SI-NEXT:    v_cvt_f16_f32_e32 v19, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v18, s16
+; SI-NEXT:    v_cvt_f16_f32_e32 v12, s19
+; SI-NEXT:    v_cvt_f16_f32_e32 v17, s18
+; SI-NEXT:    v_cvt_f16_f32_e32 v38, s21
+; SI-NEXT:    v_cvt_f16_f32_e32 v37, s20
+; SI-NEXT:    v_cvt_f16_f32_e32 v48, s23
+; SI-NEXT:    v_cvt_f16_f32_e32 v22, s22
+; SI-NEXT:    v_cvt_f16_f32_e32 v15, s25
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, s24
+; SI-NEXT:    v_cvt_f16_f32_e32 v30, s27
+; SI-NEXT:    v_cvt_f16_f32_e32 v39, s26
+; SI-NEXT:    v_cvt_f16_f32_e32 v35, s29
+; SI-NEXT:    v_cvt_f16_f32_e32 v20, s28
+; SI-NEXT:    v_writelane_b32 v63, s97, 33
+; SI-NEXT:    v_writelane_b32 v63, s98, 34
+; SI-NEXT:    v_writelane_b32 v63, s99, 35
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; SI-NEXT:    ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB95_2
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT:    v_or_b32_e32 v13, v13, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v44
-; SI-NEXT:    v_or_b32_e32 v55, v12, v11
-; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v17
-; SI-NEXT:    v_or_b32_e32 v57, v16, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v20
-; SI-NEXT:    v_or_b32_e32 v17, v14, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v22
-; SI-NEXT:    v_or_b32_e32 v21, v21, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v15
-; SI-NEXT:    v_or_b32_e32 v16, v19, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v24
-; SI-NEXT:    v_or_b32_e32 v19, v23, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v29
-; SI-NEXT:    v_or_b32_e32 v47, v60, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v43
-; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    v_or_b32_e32 v43, v42, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v33
-; SI-NEXT:    v_or_b32_e32 v14, v63, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v61
-; SI-NEXT:    v_or_b32_e32 v58, v58, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v32
-; SI-NEXT:    s_mov_b64 s[4:5], 0
+; SI-NEXT:    v_readfirstlane_b32 s4, v19
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v18
+; SI-NEXT:    s_or_b32 s44, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v12
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v17
+; SI-NEXT:    s_or_b32 s45, s5, s4
+; SI-NEXT:    s_lshr_b64 s[4:5], s[44:45], 24
+; SI-NEXT:    v_writelane_b32 v62, s4, 4
+; SI-NEXT:    v_writelane_b32 v62, s5, 5
+; SI-NEXT:    s_lshr_b64 s[4:5], s[44:45], 16
+; SI-NEXT:    v_writelane_b32 v62, s4, 2
+; SI-NEXT:    v_writelane_b32 v62, s5, 3
+; SI-NEXT:    s_lshr_b64 s[4:5], s[44:45], 8
+; SI-NEXT:    v_writelane_b32 v62, s4, 0
+; SI-NEXT:    v_writelane_b32 v62, s5, 1
+; SI-NEXT:    v_readfirstlane_b32 s4, v38
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v37
+; SI-NEXT:    s_or_b32 s42, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v48
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v22
+; SI-NEXT:    s_or_b32 s43, s5, s4
+; SI-NEXT:    s_lshr_b64 s[4:5], s[42:43], 24
+; SI-NEXT:    v_writelane_b32 v62, s4, 10
+; SI-NEXT:    v_writelane_b32 v62, s5, 11
+; SI-NEXT:    s_lshr_b64 s[4:5], s[42:43], 16
+; SI-NEXT:    v_writelane_b32 v62, s4, 8
+; SI-NEXT:    v_writelane_b32 v62, s5, 9
+; SI-NEXT:    s_lshr_b64 s[4:5], s[42:43], 8
+; SI-NEXT:    v_writelane_b32 v62, s4, 6
+; SI-NEXT:    v_writelane_b32 v62, s5, 7
+; SI-NEXT:    v_readfirstlane_b32 s5, v14
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT:    v_readfirstlane_b32 s4, v15
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    s_or_b32 s40, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v30
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v39
+; SI-NEXT:    s_or_b32 s41, s5, s4
+; SI-NEXT:    s_lshr_b64 s[4:5], s[40:41], 24
+; SI-NEXT:    v_writelane_b32 v62, s4, 16
+; SI-NEXT:    v_writelane_b32 v62, s5, 17
+; SI-NEXT:    s_lshr_b64 s[4:5], s[40:41], 16
+; SI-NEXT:    v_writelane_b32 v62, s4, 14
+; SI-NEXT:    v_writelane_b32 v62, s5, 15
+; SI-NEXT:    s_lshr_b64 s[4:5], s[40:41], 8
+; SI-NEXT:    v_writelane_b32 v62, s4, 12
+; SI-NEXT:    v_writelane_b32 v62, s5, 13
+; SI-NEXT:    v_readfirstlane_b32 s4, v35
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v20
+; SI-NEXT:    s_or_b32 s28, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v47
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_mov_b32_e32 v22, v2
+; SI-NEXT:    v_mov_b32_e32 v39, v5
+; SI-NEXT:    v_mov_b32_e32 v60, v16
+; SI-NEXT:    v_readfirstlane_b32 s46, v55
+; SI-NEXT:    v_mov_b32_e32 v17, v43
+; SI-NEXT:    v_mov_b32_e32 v40, v34
+; SI-NEXT:    v_mov_b32_e32 v41, v21
+; SI-NEXT:    v_mov_b32_e32 v51, v42
+; SI-NEXT:    s_lshr_b32 s71, s45, 8
+; SI-NEXT:    s_lshr_b32 s70, s43, 8
+; SI-NEXT:    s_lshr_b32 s69, s41, 8
+; SI-NEXT:    v_bfe_u32 v38, v47, 8, 8
+; SI-NEXT:    v_bfe_u32 v37, v33, 8, 8
+; SI-NEXT:    v_bfe_u32 v35, v32, 8, 8
+; SI-NEXT:    v_bfe_u32 v20, v10, 8, 8
+; SI-NEXT:    v_bfe_u32 v19, v9, 8, 8
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v55, v4
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_readfirstlane_b32 s5, v14
+; SI-NEXT:    s_or_b32 s29, s5, s4
+; SI-NEXT:    s_lshr_b64 s[4:5], s[28:29], 24
+; SI-NEXT:    v_writelane_b32 v62, s4, 22
+; SI-NEXT:    v_writelane_b32 v62, s5, 23
+; SI-NEXT:    s_lshr_b64 s[4:5], s[28:29], 16
+; SI-NEXT:    v_writelane_b32 v62, s4, 20
+; SI-NEXT:    v_writelane_b32 v62, s5, 21
+; SI-NEXT:    s_lshr_b64 s[4:5], s[28:29], 8
+; SI-NEXT:    v_writelane_b32 v62, s4, 18
+; SI-NEXT:    v_writelane_b32 v62, s5, 19
+; SI-NEXT:    v_readfirstlane_b32 s4, v2
+; SI-NEXT:    v_mov_b32_e32 v2, v1
+; SI-NEXT:    v_readfirstlane_b32 s5, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    s_or_b32 s26, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v33
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v43
+; SI-NEXT:    s_or_b32 s27, s5, s4
+; SI-NEXT:    s_lshr_b64 s[4:5], s[26:27], 24
+; SI-NEXT:    v_writelane_b32 v62, s4, 28
+; SI-NEXT:    v_writelane_b32 v62, s5, 29
+; SI-NEXT:    s_lshr_b64 s[4:5], s[26:27], 16
+; SI-NEXT:    v_writelane_b32 v62, s4, 26
+; SI-NEXT:    v_writelane_b32 v62, s5, 27
+; SI-NEXT:    s_lshr_b64 s[4:5], s[26:27], 8
+; SI-NEXT:    v_writelane_b32 v62, s4, 24
+; SI-NEXT:    v_writelane_b32 v62, s5, 25
+; SI-NEXT:    v_readfirstlane_b32 s4, v5
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    s_lshr_b32 s68, s29, 8
+; SI-NEXT:    s_lshr_b32 s66, s27, 8
+; SI-NEXT:    v_bfe_u32 v43, v31, 8, 8
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_readfirstlane_b32 s5, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT:    s_or_b32 s24, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v32
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_bfe_u32 v15, v5, 8, 8
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_readfirstlane_b32 s5, v1
+; SI-NEXT:    s_or_b32 s25, s5, s4
+; SI-NEXT:    s_lshr_b64 s[4:5], s[24:25], 24
+; SI-NEXT:    v_writelane_b32 v62, s4, 34
+; SI-NEXT:    v_writelane_b32 v62, s5, 35
+; SI-NEXT:    s_lshr_b64 s[4:5], s[24:25], 16
+; SI-NEXT:    v_writelane_b32 v62, s4, 32
+; SI-NEXT:    v_writelane_b32 v62, s5, 33
+; SI-NEXT:    s_lshr_b64 s[4:5], s[24:25], 8
+; SI-NEXT:    v_writelane_b32 v62, s4, 30
+; SI-NEXT:    v_writelane_b32 v62, s5, 31
+; SI-NEXT:    v_readfirstlane_b32 s4, v7
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v29
+; SI-NEXT:    s_or_b32 s22, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v31
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v6
+; SI-NEXT:    s_or_b32 s23, s5, s4
+; SI-NEXT:    s_lshr_b64 s[4:5], s[22:23], 24
+; SI-NEXT:    v_writelane_b32 v62, s4, 40
+; SI-NEXT:    v_writelane_b32 v62, s5, 41
+; SI-NEXT:    s_lshr_b64 s[4:5], s[22:23], 16
+; SI-NEXT:    v_writelane_b32 v62, s4, 38
+; SI-NEXT:    v_writelane_b32 v62, s5, 39
+; SI-NEXT:    s_lshr_b64 s[4:5], s[22:23], 8
+; SI-NEXT:    v_writelane_b32 v62, s4, 36
+; SI-NEXT:    v_writelane_b32 v62, s5, 37
+; SI-NEXT:    v_readfirstlane_b32 s4, v58
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v13
+; SI-NEXT:    s_or_b32 s20, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v10
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v11
+; SI-NEXT:    s_or_b32 s21, s5, s4
+; SI-NEXT:    s_lshr_b64 s[4:5], s[20:21], 24
+; SI-NEXT:    v_writelane_b32 v62, s4, 44
+; SI-NEXT:    v_writelane_b32 v62, s5, 45
+; SI-NEXT:    s_lshr_b64 s[4:5], s[20:21], 16
+; SI-NEXT:    v_mov_b32_e32 v58, v11
+; SI-NEXT:    v_writelane_b32 v62, s4, 42
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT:    v_writelane_b32 v62, s5, 43
+; SI-NEXT:    v_readfirstlane_b32 s5, v16
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v7, v29
+; SI-NEXT:    v_mov_b32_e32 v29, v6
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT:    v_readfirstlane_b32 s4, v34
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    s_or_b32 s18, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v9
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v61
+; SI-NEXT:    s_or_b32 s19, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v21
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v24
+; SI-NEXT:    s_or_b32 s16, s5, s4
+; SI-NEXT:    v_mov_b32_e32 v1, v53
+; SI-NEXT:    v_mov_b32_e32 v34, v61
+; SI-NEXT:    v_mov_b32_e32 v21, v24
+; SI-NEXT:    s_lshr_b32 s64, s25, 8
+; SI-NEXT:    s_lshr_b32 s54, s23, 8
+; SI-NEXT:    s_lshr_b32 s52, s21, 8
+; SI-NEXT:    s_lshr_b32 s50, s19, 8
+; SI-NEXT:    s_lshr_b64 s[62:63], s[20:21], 8
+; SI-NEXT:    s_lshr_b64 s[96:97], s[18:19], 24
+; SI-NEXT:    s_lshr_b64 s[98:99], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[58:59], s[18:19], 8
+; SI-NEXT:    v_mov_b32_e32 v13, v12
+; SI-NEXT:    v_bfe_u32 v24, v12, 8, 8
+; SI-NEXT:    v_mov_b32_e32 v12, v48
+; SI-NEXT:    v_bfe_u32 v48, v48, 8, 8
+; SI-NEXT:    v_bfe_u32 v61, v59, 8, 8
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_readfirstlane_b32 s4, v11
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_bfe_u32 v18, v11, 8, 8
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_readfirstlane_b32 s5, v16
+; SI-NEXT:    s_or_b32 s17, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v44
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v42
+; SI-NEXT:    s_or_b32 s14, s5, s4
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v60, v12, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v3
-; SI-NEXT:    v_or_b32_e32 v22, v2, v11
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v31
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v52
-; SI-NEXT:    v_or_b32_e32 v12, v46, v12
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v24, v2, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
-; SI-NEXT:    v_or_b32_e32 v4, v4, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v10
-; SI-NEXT:    v_or_b32_e32 v34, v34, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v6
-; SI-NEXT:    v_or_b32_e32 v3, v59, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v9
-; SI-NEXT:    v_or_b32_e32 v59, v56, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v18
-; SI-NEXT:    v_or_b32_e32 v6, v62, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v7
-; SI-NEXT:    v_or_b32_e32 v62, v25, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v26
-; SI-NEXT:    v_or_b32_e32 v2, v27, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
-; SI-NEXT:    v_or_b32_e32 v25, v28, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v30
-; SI-NEXT:    v_or_b32_e32 v1, v36, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v5
-; SI-NEXT:    v_or_b32_e32 v23, v35, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v38
-; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT:    v_or_b32_e32 v18, v39, v11
-; SI-NEXT:    v_mov_b32_e32 v36, v2
-; SI-NEXT:    v_mov_b32_e32 v35, v1
-; SI-NEXT:    v_alignbit_b32 v1, v55, v13, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v55, v13, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v55, v13, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v17, v57, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v17, v57, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v17, v57, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v16, v21, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v16, v21, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v16, v21, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v47, v19, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v47, v19, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v47, v19, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v14, v43, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v14, v43, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v14, v43, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v60, v58, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v60, v58, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v60, v58, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v24, v22, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v24, v22, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v24, v22, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v34, v4, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v34, v4, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v34, v4, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v59, v3, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v59, v3, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v59, v3, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v38
-; SI-NEXT:    v_or_b32_e32 v61, v50, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v49
-; SI-NEXT:    v_or_b32_e32 v2, v48, v11
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
-; SI-NEXT:    v_mov_b32_e32 v49, v6
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v62, v49, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v62, v49, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v62, v49, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v25, v36, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v25, v36, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v25, v36, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v23, v35, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v23, v35, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v23, v35, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v61, v18, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v61, v18, 16
-; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v48
-; SI-NEXT:    v_or_b32_e32 v56, v54, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v51
-; SI-NEXT:    v_or_b32_e32 v6, v53, v11
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_alignbit_b32 v1, v61, v18, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v56, v2, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_alignbit_b32 v1, v56, v2, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v55
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v17
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v47
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v14
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v60
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v34
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v59
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v62
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v25
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v23
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v37
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v61
-; SI-NEXT:    v_or_b32_e32 v54, v40, v11
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v56
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v54
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v12
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v1, v44, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v1, v20, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v1, v15, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v1, v29, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v1, v33, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v1, v32, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v1, v31, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v1, v10, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v1, v9, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v1, v8, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v1, v5, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v1, v38, 8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v1, v48, 8, 8
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v41
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v1, v37, 8, 8
-; SI-NEXT:    v_or_b32_e32 v11, v45, v11
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v1, v52, 8, 8
-; SI-NEXT:    v_alignbit_b32 v28, v56, v2, 24
-; SI-NEXT:    v_alignbit_b32 v2, v54, v6, 24
-; SI-NEXT:    v_alignbit_b32 v39, v54, v6, 16
-; SI-NEXT:    v_alignbit_b32 v30, v54, v6, 8
-; SI-NEXT:    v_alignbit_b32 v27, v12, v11, 24
-; SI-NEXT:    v_alignbit_b32 v42, v12, v11, 16
-; SI-NEXT:    v_alignbit_b32 v26, v12, v11, 8
-; SI-NEXT:    v_mov_b32_e32 v20, v29
-; SI-NEXT:    v_mov_b32_e32 v15, v33
-; SI-NEXT:    v_mov_b32_e32 v33, v32
+; SI-NEXT:    v_readfirstlane_b32 s4, v6
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v46
+; SI-NEXT:    s_or_b32 s15, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v8
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v23
+; SI-NEXT:    s_or_b32 s12, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v5
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v3
+; SI-NEXT:    s_or_b32 s13, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v27
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v28
+; SI-NEXT:    s_or_b32 s10, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v4
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v45
+; SI-NEXT:    s_or_b32 s11, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v36
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v57
+; SI-NEXT:    s_or_b32 s8, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v59
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v56
+; SI-NEXT:    s_or_b32 s9, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v26
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v25
+; SI-NEXT:    s_or_b32 s6, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v52
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v50
+; SI-NEXT:    s_or_b32 s7, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v49
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v54
+; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s5, v53
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    s_or_b32 s5, s46, s5
+; SI-NEXT:    s_lshr_b64 vcc, s[4:5], 24
+; SI-NEXT:    v_writelane_b32 v62, vcc_lo, 48
+; SI-NEXT:    v_writelane_b32 v62, vcc_hi, 49
+; SI-NEXT:    s_lshr_b64 vcc, s[4:5], 8
+; SI-NEXT:    v_writelane_b32 v62, vcc_lo, 46
+; SI-NEXT:    s_lshr_b32 s48, s17, 8
+; SI-NEXT:    s_lshr_b32 s67, s15, 8
+; SI-NEXT:    s_lshr_b32 s65, s13, 8
+; SI-NEXT:    s_lshr_b32 s55, s11, 8
+; SI-NEXT:    s_lshr_b32 s53, s9, 8
+; SI-NEXT:    s_lshr_b32 s51, s7, 8
+; SI-NEXT:    s_lshr_b32 s49, s5, 8
+; SI-NEXT:    s_lshr_b64 s[34:35], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[36:37], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[38:39], s[16:17], 8
+; SI-NEXT:    s_lshr_b64 s[92:93], s[14:15], 24
+; SI-NEXT:    s_lshr_b64 s[94:95], s[14:15], 16
+; SI-NEXT:    s_lshr_b64 s[30:31], s[14:15], 8
+; SI-NEXT:    s_lshr_b64 s[78:79], s[12:13], 24
+; SI-NEXT:    s_lshr_b64 s[88:89], s[12:13], 16
+; SI-NEXT:    s_lshr_b64 s[90:91], s[12:13], 8
+; SI-NEXT:    s_lshr_b64 s[72:73], s[10:11], 24
+; SI-NEXT:    s_lshr_b64 s[74:75], s[10:11], 16
+; SI-NEXT:    s_lshr_b64 s[76:77], s[10:11], 8
+; SI-NEXT:    s_lshr_b64 s[46:47], s[8:9], 24
+; SI-NEXT:    s_lshr_b64 s[56:57], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[8:9], 8
+; SI-NEXT:    s_lshr_b64 s[82:83], s[6:7], 24
+; SI-NEXT:    s_lshr_b64 s[84:85], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[86:87], s[6:7], 8
+; SI-NEXT:    s_lshr_b64 s[80:81], s[4:5], 16
+; SI-NEXT:    v_writelane_b32 v62, vcc_hi, 47
+; SI-NEXT:    s_mov_b64 vcc, 0
+; SI-NEXT:    v_mov_b32_e32 v57, v30
+; SI-NEXT:    v_bfe_u32 v50, v30, 8, 8
+; SI-NEXT:    v_mov_b32_e32 v56, v47
+; SI-NEXT:    v_mov_b32_e32 v46, v33
+; SI-NEXT:    v_mov_b32_e32 v30, v32
 ; SI-NEXT:    v_mov_b32_e32 v32, v31
 ; SI-NEXT:    v_mov_b32_e32 v31, v10
-; SI-NEXT:    v_mov_b32_e32 v10, v9
-; SI-NEXT:    v_mov_b32_e32 v9, v7
-; SI-NEXT:    v_bfe_u32 v29, v7, 8, 8
-; SI-NEXT:    v_mov_b32_e32 v7, v8
-; SI-NEXT:    v_mov_b32_e32 v8, v5
-; SI-NEXT:    v_mov_b32_e32 v44, v37
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v54, v9
+; SI-NEXT:    v_bfe_u32 v42, v6, 8, 8
+; SI-NEXT:    v_bfe_u32 v45, v4, 8, 8
+; SI-NEXT:    v_bfe_u32 v47, v52, 8, 8
+; SI-NEXT:    v_bfe_u32 v33, v1, 8, 8
+; SI-NEXT:    v_mov_b32_e32 v3, v14
+; SI-NEXT:    v_mov_b32_e32 v25, v59
+; SI-NEXT:    v_mov_b32_e32 v1, v52
+; SI-NEXT:    v_mov_b32_e32 v44, v11
 ; SI-NEXT:    s_branch .LBB95_3
 ; SI-NEXT:  .LBB95_2:
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    v_mov_b32_e32 v20, v29
-; SI-NEXT:    v_mov_b32_e32 v15, v33
-; SI-NEXT:    v_mov_b32_e32 v33, v32
+; SI-NEXT:    v_mov_b32_e32 v60, v16
+; SI-NEXT:    s_waitcnt expcnt(4)
+; SI-NEXT:    v_mov_b32_e32 v7, v29
+; SI-NEXT:    v_mov_b32_e32 v29, v6
+; SI-NEXT:    v_mov_b32_e32 v39, v5
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr80
+; SI-NEXT:    v_mov_b32_e32 v51, v42
+; SI-NEXT:    v_writelane_b32 v62, s4, 0
+; SI-NEXT:    v_writelane_b32 v62, s5, 1
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_mov_b32_e32 v41, v21
+; SI-NEXT:    v_writelane_b32 v62, s4, 2
+; SI-NEXT:    v_writelane_b32 v62, s5, 3
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_mov_b32_e32 v21, v24
+; SI-NEXT:    v_writelane_b32 v62, s4, 4
+; SI-NEXT:    v_writelane_b32 v62, s5, 5
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_mov_b32_e32 v40, v34
+; SI-NEXT:    v_writelane_b32 v62, s4, 6
+; SI-NEXT:    v_writelane_b32 v62, s5, 7
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_mov_b32_e32 v34, v61
+; SI-NEXT:    v_writelane_b32 v62, s4, 8
+; SI-NEXT:    v_writelane_b32 v62, s5, 9
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_mov_b32_e32 v13, v12
+; SI-NEXT:    v_writelane_b32 v62, s4, 10
+; SI-NEXT:    v_writelane_b32 v62, s5, 11
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_mov_b32_e32 v12, v48
+; SI-NEXT:    v_writelane_b32 v62, s4, 12
+; SI-NEXT:    v_writelane_b32 v62, s5, 13
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_mov_b32_e32 v57, v30
+; SI-NEXT:    v_writelane_b32 v62, s4, 14
+; SI-NEXT:    v_writelane_b32 v62, s5, 15
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_mov_b32_e32 v58, v11
+; SI-NEXT:    v_writelane_b32 v62, s4, 16
+; SI-NEXT:    v_writelane_b32 v62, s5, 17
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_mov_b32_e32 v56, v47
+; SI-NEXT:    v_writelane_b32 v62, s4, 18
+; SI-NEXT:    v_writelane_b32 v62, s5, 19
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_mov_b32_e32 v46, v33
+; SI-NEXT:    v_writelane_b32 v62, s4, 20
+; SI-NEXT:    v_writelane_b32 v62, s5, 21
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_mov_b32_e32 v30, v32
+; SI-NEXT:    v_writelane_b32 v62, s4, 22
+; SI-NEXT:    v_writelane_b32 v62, s5, 23
+; SI-NEXT:    ; implicit-def: $sgpr4
 ; SI-NEXT:    v_mov_b32_e32 v32, v31
+; SI-NEXT:    v_writelane_b32 v62, s4, 24
+; SI-NEXT:    v_writelane_b32 v62, s5, 25
+; SI-NEXT:    ; implicit-def: $sgpr4
 ; SI-NEXT:    v_mov_b32_e32 v31, v10
-; SI-NEXT:    v_mov_b32_e32 v10, v9
-; SI-NEXT:    v_mov_b32_e32 v9, v7
-; SI-NEXT:    v_mov_b32_e32 v7, v8
-; SI-NEXT:    v_mov_b32_e32 v8, v5
-; SI-NEXT:    v_mov_b32_e32 v44, v37
-; SI-NEXT:    s_mov_b64 s[4:5], -1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $vgpr55
-; SI-NEXT:    ; implicit-def: $vgpr57
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr16
-; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    ; implicit-def: $vgpr47
-; SI-NEXT:    ; implicit-def: $vgpr43
-; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; implicit-def: $vgpr58
-; SI-NEXT:    ; implicit-def: $vgpr60
-; SI-NEXT:    ; implicit-def: $vgpr22
+; SI-NEXT:    v_writelane_b32 v62, s4, 26
+; SI-NEXT:    v_writelane_b32 v62, s5, 27
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    v_mov_b32_e32 v54, v9
+; SI-NEXT:    v_writelane_b32 v62, s4, 28
+; SI-NEXT:    v_writelane_b32 v62, s5, 29
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v55, v4
+; SI-NEXT:    v_writelane_b32 v62, s4, 30
+; SI-NEXT:    v_writelane_b32 v62, s5, 31
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_mov_b32_e32 v22, v2
+; SI-NEXT:    v_writelane_b32 v62, s4, 32
+; SI-NEXT:    v_writelane_b32 v62, s5, 33
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_mov_b32_e32 v2, v1
+; SI-NEXT:    v_writelane_b32 v62, s4, 34
+; SI-NEXT:    v_writelane_b32 v62, s5, 35
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_mov_b32_e32 v17, v43
+; SI-NEXT:    v_writelane_b32 v62, s4, 36
+; SI-NEXT:    v_writelane_b32 v62, s5, 37
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    s_mov_b64 vcc, -1
+; SI-NEXT:    v_writelane_b32 v62, s4, 38
+; SI-NEXT:    v_writelane_b32 v62, s5, 39
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_mov_b32_e32 v25, v59
+; SI-NEXT:    v_writelane_b32 v62, s4, 40
+; SI-NEXT:    v_writelane_b32 v62, s5, 41
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_mov_b32_e32 v1, v52
+; SI-NEXT:    v_writelane_b32 v62, s4, 42
+; SI-NEXT:    v_writelane_b32 v62, s5, 43
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr44
+; SI-NEXT:    ; implicit-def: $sgpr71
 ; SI-NEXT:    ; implicit-def: $vgpr24
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $vgpr34
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr59
-; SI-NEXT:    ; implicit-def: $vgpr49
-; SI-NEXT:    ; implicit-def: $vgpr62
-; SI-NEXT:    ; implicit-def: $vgpr29
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr25
+; SI-NEXT:    ; implicit-def: $sgpr42
+; SI-NEXT:    ; implicit-def: $sgpr70
+; SI-NEXT:    ; implicit-def: $vgpr48
+; SI-NEXT:    ; implicit-def: $sgpr40
+; SI-NEXT:    ; implicit-def: $sgpr69
+; SI-NEXT:    ; implicit-def: $vgpr50
+; SI-NEXT:    ; implicit-def: $sgpr28
+; SI-NEXT:    ; implicit-def: $sgpr68
+; SI-NEXT:    ; implicit-def: $vgpr38
+; SI-NEXT:    ; implicit-def: $sgpr26
+; SI-NEXT:    ; implicit-def: $sgpr66
+; SI-NEXT:    ; implicit-def: $vgpr37
+; SI-NEXT:    ; implicit-def: $sgpr24
+; SI-NEXT:    ; implicit-def: $sgpr64
 ; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; implicit-def: $vgpr61
-; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr56
-; SI-NEXT:    ; implicit-def: $vgpr30
-; SI-NEXT:    ; implicit-def: $vgpr39
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr54
-; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; implicit-def: $vgpr26
+; SI-NEXT:    ; implicit-def: $sgpr22
+; SI-NEXT:    ; implicit-def: $sgpr54
+; SI-NEXT:    ; implicit-def: $vgpr43
+; SI-NEXT:    ; implicit-def: $sgpr20
+; SI-NEXT:    ; implicit-def: $sgpr62
+; SI-NEXT:    ; implicit-def: $sgpr52
+; SI-NEXT:    ; implicit-def: $vgpr20
+; SI-NEXT:    ; implicit-def: $sgpr18
+; SI-NEXT:    ; implicit-def: $sgpr58
+; SI-NEXT:    ; implicit-def: $sgpr98
+; SI-NEXT:    ; implicit-def: $sgpr96
+; SI-NEXT:    ; implicit-def: $sgpr50
+; SI-NEXT:    ; implicit-def: $vgpr19
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $vgpr18
+; SI-NEXT:    ; implicit-def: $sgpr67
 ; SI-NEXT:    ; implicit-def: $vgpr42
-; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
+; SI-NEXT:    ; implicit-def: $sgpr65
+; SI-NEXT:    ; implicit-def: $vgpr15
+; SI-NEXT:    ; implicit-def: $sgpr55
+; SI-NEXT:    ; implicit-def: $vgpr45
+; SI-NEXT:    ; implicit-def: $sgpr53
+; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; implicit-def: $sgpr51
+; SI-NEXT:    ; implicit-def: $vgpr47
+; SI-NEXT:    ; implicit-def: $sgpr49
+; SI-NEXT:    ; implicit-def: $vgpr33
+; SI-NEXT:    ; implicit-def: $sgpr16
+; SI-NEXT:    ; implicit-def: $sgpr38
+; SI-NEXT:    ; implicit-def: $sgpr36
+; SI-NEXT:    ; implicit-def: $sgpr34
+; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr30
+; SI-NEXT:    ; implicit-def: $sgpr94
+; SI-NEXT:    ; implicit-def: $sgpr92
+; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr90
+; SI-NEXT:    ; implicit-def: $sgpr88
+; SI-NEXT:    ; implicit-def: $sgpr78
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr76
+; SI-NEXT:    ; implicit-def: $sgpr74
+; SI-NEXT:    ; implicit-def: $sgpr72
+; SI-NEXT:    ; implicit-def: $sgpr8
+; SI-NEXT:    ; implicit-def: $sgpr60
+; SI-NEXT:    ; implicit-def: $sgpr56
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr86
+; SI-NEXT:    ; implicit-def: $sgpr84
+; SI-NEXT:    ; implicit-def: $sgpr82
+; SI-NEXT:    v_writelane_b32 v62, s4, 44
+; SI-NEXT:    v_writelane_b32 v62, s5, 45
+; SI-NEXT:    v_writelane_b32 v62, s80, 46
+; SI-NEXT:    v_writelane_b32 v62, s81, 47
+; SI-NEXT:    ; implicit-def: $sgpr80
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_writelane_b32 v62, s80, 48
+; SI-NEXT:    v_writelane_b32 v62, s81, 49
+; SI-NEXT:    ; implicit-def: $sgpr80
 ; SI-NEXT:  .LBB95_3: ; %Flow
-; SI-NEXT:    s_waitcnt expcnt(5)
-; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v1, v44
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
-; SI-NEXT:    v_mov_b32_e32 v5, v8
-; SI-NEXT:    v_mov_b32_e32 v6, v7
-; SI-NEXT:    v_mov_b32_e32 v7, v9
-; SI-NEXT:    v_mov_b32_e32 v8, v10
-; SI-NEXT:    v_mov_b32_e32 v9, v31
-; SI-NEXT:    v_mov_b32_e32 v31, v33
-; SI-NEXT:    v_mov_b32_e32 v44, v15
-; SI-NEXT:    v_mov_b32_e32 v33, v20
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; SI-NEXT:    v_mov_b32_e32 v38, v2
-; SI-NEXT:    v_mov_b32_e32 v53, v30
-; SI-NEXT:    v_mov_b32_e32 v40, v28
-; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_mov_b32_e32 v2, v48
-; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; SI-NEXT:    v_mov_b32_e32 v30, v29
-; SI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; SI-NEXT:    v_mov_b32_e32 v50, v42
-; SI-NEXT:    v_mov_b32_e32 v42, v26
-; SI-NEXT:    v_mov_b32_e32 v26, v52
+; SI-NEXT:    v_mov_b32_e32 v14, v17
+; SI-NEXT:    v_mov_b32_e32 v17, v39
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT:    s_andn2_b64 vcc, exec, vcc
+; SI-NEXT:    v_mov_b32_e32 v23, v2
+; SI-NEXT:    v_mov_b32_e32 v59, v34
+; SI-NEXT:    v_mov_b32_e32 v2, v25
 ; SI-NEXT:    s_cbranch_vccnz .LBB95_5
 ; SI-NEXT:  ; %bb.4: ; %cmp.true
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v52, v29
+; SI-NEXT:    v_mov_b32_e32 v29, v7
+; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v18, v14
+; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    v_mov_b32_e32 v15, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v53
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v55
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v53, v3
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v55, v4
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v54
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v31
 ; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_cvt_f32_f16_e32 v35, v37
-; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
-; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
-; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT:    v_cvt_f32_f16_e32 v38, v41
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v33, v33
+; SI-NEXT:    v_cvt_f16_f32_e32 v54, v8
 ; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT:    v_add_f32_e32 v38, 0x38000000, v38
-; SI-NEXT:    v_cvt_f16_f32_e32 v38, v38
-; SI-NEXT:    v_add_f32_e32 v31, 0x38000000, v31
-; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
-; SI-NEXT:    v_add_f32_e32 v33, 0x38000000, v33
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, v33
-; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_cvt_f32_f16_e32 v18, v18
-; SI-NEXT:    s_waitcnt vmcnt(8)
-; SI-NEXT:    v_cvt_f32_f16_e32 v20, v20
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v15, 0x38000000, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; SI-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
-; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; SI-NEXT:    v_add_f32_e32 v20, 0x38000000, v20
-; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
-; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f16_f32_e32 v31, v9
+; SI-NEXT:    v_cvt_f32_f16_e32 v10, v32
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, v57
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
+; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
+; SI-NEXT:    v_cvt_f16_f32_e32 v32, v10
 ; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_cvt_f32_f16_e32 v49, v49
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cvt_f32_f16_e32 v39, v39
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v50, v50
-; SI-NEXT:    v_add_f32_e32 v39, 0x38000000, v39
-; SI-NEXT:    v_cvt_f16_f32_e32 v39, v39
-; SI-NEXT:    v_add_f32_e32 v50, 0x38000000, v50
-; SI-NEXT:    v_cvt_f16_f32_e32 v50, v50
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v26
-; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
-; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v34, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v34
-; SI-NEXT:    v_or_b32_e32 v11, v12, v11
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v57, v11
 ; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; SI-NEXT:    v_or_b32_e32 v12, v12, v13
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
 ; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; SI-NEXT:    v_bfe_u32 v50, v57, 8, 8
+; SI-NEXT:    v_bfe_u32 v48, v12, 8, 8
+; SI-NEXT:    v_bfe_u32 v43, v32, 8, 8
+; SI-NEXT:    v_bfe_u32 v24, v13, 8, 8
+; SI-NEXT:    v_bfe_u32 v20, v31, 8, 8
+; SI-NEXT:    v_bfe_u32 v19, v54, 8, 8
+; SI-NEXT:    v_bfe_u32 v42, v6, 8, 8
+; SI-NEXT:    v_bfe_u32 v45, v55, 8, 8
+; SI-NEXT:    v_bfe_u32 v61, v2, 8, 8
+; SI-NEXT:    v_bfe_u32 v33, v53, 8, 8
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v7
+; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s4, v14
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v7
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, v44
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v44, v7
+; SI-NEXT:    v_readfirstlane_b32 s5, v14
 ; SI-NEXT:    v_cvt_f32_f16_e32 v14, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s5, v53
 ; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; SI-NEXT:    v_or_b32_e32 v36, v14, v13
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    v_readfirstlane_b32 s6, v14
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    s_lshr_b64 vcc, s[4:5], 24
+; SI-NEXT:    s_lshr_b64 s[80:81], s[4:5], 16
+; SI-NEXT:    s_lshr_b32 s49, s5, 8
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v14, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; SI-NEXT:    v_or_b32_e32 v54, v14, v16
-; SI-NEXT:    v_cvt_f32_f16_e32 v16, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v16, 0x38000000, v16
-; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, v4
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v13
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cvt_f32_f16_e32 v17, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v17, 0x38000000, v17
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
-; SI-NEXT:    v_or_b32_e32 v52, v17, v16
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v27, v4
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v27
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v17, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f16_f32_e32 v27, v27
-; SI-NEXT:    v_add_f32_e32 v17, 0x38000000, v17
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
-; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; SI-NEXT:    v_or_b32_e32 v56, v17, v19
-; SI-NEXT:    v_alignbit_b32 v40, v56, v52, 24
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v19, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
-; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; SI-NEXT:    v_readfirstlane_b32 s6, v14
+; SI-NEXT:    s_lshl_b32 s6, s6, 16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v21, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v21, 0x38000000, v21
-; SI-NEXT:    v_cvt_f16_f32_e32 v21, v21
-; SI-NEXT:    v_or_b32_e32 v42, v21, v19
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s7, v14
+; SI-NEXT:    s_or_b32 s6, s7, s6
+; SI-NEXT:    v_readfirstlane_b32 s7, v3
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v21, v1
-; SI-NEXT:    v_mov_b32_e32 v1, v34
-; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v8
-; SI-NEXT:    v_add_f32_e32 v21, 0x38000000, v21
-; SI-NEXT:    v_cvt_f16_f32_e32 v21, v21
-; SI-NEXT:    v_or_b32_e32 v61, v21, v22
-; SI-NEXT:    v_cvt_f32_f16_e32 v22, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v22, 0x38000000, v22
-; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s8, v14
+; SI-NEXT:    s_or_b32 s7, s8, s7
+; SI-NEXT:    s_lshr_b64 s[82:83], s[6:7], 24
+; SI-NEXT:    s_lshr_b64 s[84:85], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[86:87], s[6:7], 8
+; SI-NEXT:    s_lshr_b32 s51, s7, 8
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v23, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v23, 0x38000000, v23
-; SI-NEXT:    v_cvt_f16_f32_e32 v23, v23
-; SI-NEXT:    v_or_b32_e32 v16, v23, v22
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s8, v14
+; SI-NEXT:    s_lshl_b32 s8, s8, 16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v23, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v23, 0x38000000, v23
-; SI-NEXT:    v_cvt_f16_f32_e32 v23, v23
-; SI-NEXT:    v_or_b32_e32 v23, v23, v25
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s9, v14
+; SI-NEXT:    s_or_b32 s8, s9, s8
+; SI-NEXT:    v_readfirstlane_b32 s9, v2
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v24, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v24, 0x38000000, v24
-; SI-NEXT:    v_cvt_f16_f32_e32 v24, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s10, v14
+; SI-NEXT:    s_or_b32 s9, s10, s9
+; SI-NEXT:    s_lshr_b64 s[56:57], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[8:9], 8
+; SI-NEXT:    s_lshr_b32 s53, s9, 8
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v26, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v25, 0x38000000, v26
-; SI-NEXT:    v_cvt_f16_f32_e32 v25, v25
-; SI-NEXT:    v_or_b32_e32 v48, v25, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v6
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s10, v14
+; SI-NEXT:    s_lshl_b32 s10, s10, 16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v26, v3
-; SI-NEXT:    v_add_f32_e32 v26, 0x38000000, v26
-; SI-NEXT:    v_cvt_f16_f32_e32 v26, v26
-; SI-NEXT:    v_or_b32_e32 v25, v26, v25
-; SI-NEXT:    v_cvt_f32_f16_e32 v26, v4
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v26, 0x38000000, v26
-; SI-NEXT:    v_cvt_f16_f32_e32 v26, v26
-; SI-NEXT:    v_or_b32_e32 v53, v26, v27
-; SI-NEXT:    v_mov_b32_e32 v26, v13
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v7
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v28, v4
-; SI-NEXT:    v_add_f32_e32 v28, 0x38000000, v28
-; SI-NEXT:    v_cvt_f16_f32_e32 v28, v28
-; SI-NEXT:    v_or_b32_e32 v62, v28, v27
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s11, v14
+; SI-NEXT:    s_or_b32 s10, s11, s10
+; SI-NEXT:    v_readfirstlane_b32 s11, v55
+; SI-NEXT:    s_lshl_b32 s11, s11, 16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v29, v13
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; SI-NEXT:    v_alignbit_b32 v27, v12, v11, 24
-; SI-NEXT:    v_add_f32_e32 v29, 0x38000000, v29
-; SI-NEXT:    v_cvt_f16_f32_e32 v29, v29
-; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v29
-; SI-NEXT:    v_cvt_f32_f16_e32 v29, v63
-; SI-NEXT:    v_add_f32_e32 v29, 0x38000000, v29
-; SI-NEXT:    v_cvt_f16_f32_e32 v29, v29
-; SI-NEXT:    v_or_b32_e32 v59, v29, v34
-; SI-NEXT:    v_add_f32_e32 v34, 0x38000000, v35
-; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; SI-NEXT:    v_mov_b32_e32 v29, v36
-; SI-NEXT:    v_cvt_f32_f16_e32 v36, v51
-; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f16_f32_e32 v34, v34
-; SI-NEXT:    v_add_f32_e32 v36, 0x38000000, v36
-; SI-NEXT:    v_cvt_f16_f32_e32 v36, v36
-; SI-NEXT:    v_lshlrev_b32_e32 v36, 16, v36
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cvt_f32_f16_e32 v30, v13
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v30, 0x38000000, v30
-; SI-NEXT:    v_cvt_f16_f32_e32 v30, v30
-; SI-NEXT:    v_or_b32_e32 v3, v30, v28
-; SI-NEXT:    v_cvt_f32_f16_e32 v30, v10
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v32
-; SI-NEXT:    v_cvt_f32_f16_e32 v32, v44
-; SI-NEXT:    v_add_f32_e32 v30, 0x38000000, v30
-; SI-NEXT:    v_cvt_f16_f32_e32 v30, v30
-; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; SI-NEXT:    v_add_f32_e32 v32, 0x38000000, v32
-; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cvt_f32_f16_e32 v35, v35
-; SI-NEXT:    v_or_b32_e32 v4, v34, v30
-; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v9
-; SI-NEXT:    v_cvt_f16_f32_e32 v44, v32
-; SI-NEXT:    v_add_f32_e32 v35, 0x38000000, v35
-; SI-NEXT:    v_cvt_f16_f32_e32 v35, v35
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v51, v51
-; SI-NEXT:    v_mov_b32_e32 v30, v10
-; SI-NEXT:    v_mov_b32_e32 v32, v30
-; SI-NEXT:    v_or_b32_e32 v34, v35, v34
-; SI-NEXT:    v_cvt_f32_f16_e32 v35, v46
-; SI-NEXT:    v_add_f32_e32 v51, 0x38000000, v51
-; SI-NEXT:    v_cvt_f16_f32_e32 v51, v51
-; SI-NEXT:    v_add_f32_e32 v35, 0x38000000, v35
-; SI-NEXT:    v_cvt_f16_f32_e32 v35, v35
-; SI-NEXT:    v_or_b32_e32 v22, v35, v36
-; SI-NEXT:    v_lshlrev_b32_e32 v36, 16, v10
-; SI-NEXT:    v_mov_b32_e32 v35, v16
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s12, v14
+; SI-NEXT:    s_or_b32 s11, s12, s11
+; SI-NEXT:    s_lshr_b64 s[72:73], s[10:11], 24
+; SI-NEXT:    s_lshr_b64 s[74:75], s[10:11], 16
+; SI-NEXT:    s_lshr_b64 s[76:77], s[10:11], 8
+; SI-NEXT:    s_lshr_b32 s55, s11, 8
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v13
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v37
-; SI-NEXT:    v_cvt_f16_f32_e32 v37, v37
-; SI-NEXT:    v_or_b32_e32 v24, v37, v36
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v38
-; SI-NEXT:    v_or_b32_e32 v58, v39, v37
-; SI-NEXT:    v_cvt_f32_f16_e32 v39, v45
-; SI-NEXT:    v_mov_b32_e32 v36, v48
-; SI-NEXT:    v_alignbit_b32 v38, v54, v29, 24
-; SI-NEXT:    v_add_f32_e32 v39, 0x38000000, v39
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, v39
-; SI-NEXT:    v_lshlrev_b32_e32 v39, 16, v31
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s12, v14
+; SI-NEXT:    s_lshl_b32 s12, s12, 16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v13
-; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v37
-; SI-NEXT:    v_cvt_f16_f32_e32 v37, v37
-; SI-NEXT:    v_or_b32_e32 v60, v37, v39
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v48
-; SI-NEXT:    v_add_f32_e32 v48, 0x38000000, v49
-; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, v48
-; SI-NEXT:    v_alignbit_b32 v39, v54, v29, 16
-; SI-NEXT:    v_or_b32_e32 v43, v48, v37
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v44
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v49, v49
-; SI-NEXT:    v_add_f32_e32 v49, 0x38000000, v49
-; SI-NEXT:    v_cvt_f16_f32_e32 v49, v49
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s13, v14
+; SI-NEXT:    s_or_b32 s12, s13, s12
+; SI-NEXT:    v_readfirstlane_b32 s13, v5
+; SI-NEXT:    s_lshl_b32 s13, s13, 16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v37
-; SI-NEXT:    v_or_b32_e32 v14, v49, v48
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v37
-; SI-NEXT:    v_cvt_f16_f32_e32 v37, v37
-; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
-; SI-NEXT:    v_alignbit_b32 v28, v14, v43, 8
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v37
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, v48
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s14, v14
+; SI-NEXT:    s_or_b32 s13, s14, s13
+; SI-NEXT:    s_lshr_b64 s[78:79], s[12:13], 24
+; SI-NEXT:    s_lshr_b64 s[88:89], s[12:13], 16
+; SI-NEXT:    s_lshr_b64 s[90:91], s[12:13], 8
+; SI-NEXT:    s_lshr_b32 s65, s13, 8
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s14, v14
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v51
+; SI-NEXT:    s_lshl_b32 s14, s14, 16
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s15, v14
+; SI-NEXT:    s_or_b32 s14, s15, s14
+; SI-NEXT:    v_readfirstlane_b32 s15, v6
+; SI-NEXT:    s_lshl_b32 s15, s15, 16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v49, v49
-; SI-NEXT:    v_add_f32_e32 v48, 0x38000000, v48
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, v48
-; SI-NEXT:    v_add_f32_e32 v49, 0x38000000, v49
-; SI-NEXT:    v_cvt_f16_f32_e32 v49, v49
-; SI-NEXT:    v_or_b32_e32 v19, v48, v37
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v33
-; SI-NEXT:    v_or_b32_e32 v47, v49, v37
-; SI-NEXT:    v_lshlrev_b32_e32 v49, 16, v15
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s16, v14
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v41
+; SI-NEXT:    s_or_b32 s15, s16, s15
+; SI-NEXT:    s_lshr_b64 s[92:93], s[14:15], 24
+; SI-NEXT:    s_lshr_b64 s[94:95], s[14:15], 16
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    s_lshr_b64 s[30:31], s[14:15], 8
+; SI-NEXT:    s_lshr_b32 s67, s15, 8
+; SI-NEXT:    v_readfirstlane_b32 s16, v14
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v21
+; SI-NEXT:    s_lshl_b32 s16, s16, 16
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s17, v14
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v16
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    v_readfirstlane_b32 s17, v44
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s18, v14
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v40
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_lshr_b64 s[34:35], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[36:37], s[16:17], 16
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    s_lshr_b64 s[38:39], s[16:17], 8
+; SI-NEXT:    s_lshr_b32 s48, s17, 8
+; SI-NEXT:    v_readfirstlane_b32 s18, v14
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v60
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s19, v14
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v59
+; SI-NEXT:    s_or_b32 s18, s19, s18
+; SI-NEXT:    v_readfirstlane_b32 s19, v54
+; SI-NEXT:    s_lshl_b32 s19, s19, 16
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s20, v14
+; SI-NEXT:    s_or_b32 s19, s20, s19
+; SI-NEXT:    s_lshr_b64 s[96:97], s[18:19], 24
+; SI-NEXT:    s_lshr_b64 s[98:99], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[58:59], s[18:19], 8
+; SI-NEXT:    s_lshr_b32 s50, s19, 8
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, v48
-; SI-NEXT:    v_add_f32_e32 v48, 0x38000000, v48
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, v48
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v48
-; SI-NEXT:    v_or_b32_e32 v21, v50, v37
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v37
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, v48
-; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v37
-; SI-NEXT:    v_cvt_f16_f32_e32 v37, v37
-; SI-NEXT:    v_add_f32_e32 v48, 0x38000000, v48
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, v48
-; SI-NEXT:    v_or_b32_e32 v16, v37, v49
-; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v50, v50
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v48
-; SI-NEXT:    v_add_f32_e32 v48, 0x38000000, v50
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, v48
-; SI-NEXT:    v_alignbit_b32 v50, v12, v11, 16
-; SI-NEXT:    v_or_b32_e32 v57, v48, v37
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v18
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v49, v49
-; SI-NEXT:    v_add_f32_e32 v49, 0x38000000, v49
-; SI-NEXT:    v_cvt_f16_f32_e32 v49, v49
-; SI-NEXT:    v_or_b32_e32 v17, v49, v48
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v37
-; SI-NEXT:    v_mov_b32_e32 v49, v53
-; SI-NEXT:    v_alignbit_b32 v53, v54, v29, 8
-; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v37
-; SI-NEXT:    v_cvt_f16_f32_e32 v37, v37
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v37
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s20, v14
+; SI-NEXT:    s_lshl_b32 s20, s20, 16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, v48
-; SI-NEXT:    v_add_f32_e32 v48, 0x38000000, v48
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, v48
-; SI-NEXT:    v_or_b32_e32 v13, v48, v37
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v20
-; SI-NEXT:    v_or_b32_e32 v55, v51, v37
-; SI-NEXT:    v_alignbit_b32 v10, v55, v13, 16
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v55, v13, 8
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v17, v57, 24
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v17, v57, 16
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v17, v57, 8
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v16, v21, 24
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v16, v21, 16
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v16, v21, 8
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v47, v19, 24
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v47, v19, 16
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v47, v19, 8
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v14, v43, 24
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v14, v43, 16
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v60, v58, 24
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v60, v58, 16
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v60, v58, 8
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v24, v22, 24
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v24, v22, 16
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v24, v22, 8
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v34, v4, 24
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v34, v4, 16
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v34, v4, 8
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v59, v3, 24
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v59, v3, 16
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v59, v3, 8
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v62, v49, 24
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v62, v49, 16
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v62, v49, 8
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v25, v36, 24
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v25, v36, 16
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v25, v36, 8
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v23, v35, 24
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v23, v35, 16
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v23, v35, 8
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v61, v42, 24
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v61, v42, 16
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_alignbit_b32 v10, v61, v42, 8
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v10, v56, v52, 16
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_alignbit_b32 v10, v56, v52, 8
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 8, v55
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 8, v17
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 8, v16
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 8, v47
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 8, v14
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 8, v60
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 8, v24
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 8, v59
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 8, v62
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 8, v25
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 8, v23
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 8, v61
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 8, v56
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 8, v54
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 8, v12
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v10, v20, 8, 8
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v10, v18, 8, 8
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v10, v15, 8, 8
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v10, v33, 8, 8
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v10, v44, 8, 8
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v10, v31, 8, 8
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v10, v30, 8, 8
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v10, v9, 8, 8
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v10, v8, 8, 8
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v10, v6, 8, 8
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v10, v5, 8, 8
-; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_bfe_u32 v10, v26, 8, 8
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v10, v2, 8, 8
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; SI-NEXT:    v_alignbit_b32 v48, v55, v13, 24
-; SI-NEXT:    v_alignbit_b32 v42, v12, v11, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v29, 8, v34
-; SI-NEXT:    v_bfe_u32 v30, v7, 8, 8
-; SI-NEXT:    v_mov_b32_e32 v26, v1
-; SI-NEXT:    s_waitcnt vmcnt(2) expcnt(0)
-; SI-NEXT:    v_bfe_u32 v10, v37, 8, 8
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v10, v1, 8, 8
-; SI-NEXT:    v_mov_b32_e32 v1, v37
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT:  .LBB95_5: ; %end
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_and_b32_e32 v37, 0xff, v13
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v52, 24, v48
-; SI-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; SI-NEXT:    v_and_b32_e32 v18, 0xff, v18
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; SI-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; SI-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v51, 8, v10
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
-; SI-NEXT:    v_or_b32_e32 v37, v37, v51
-; SI-NEXT:    v_and_b32_e32 v37, 0xffff, v37
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v51, 0xff, v10
-; SI-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
-; SI-NEXT:    v_or_b32_e32 v51, v52, v51
-; SI-NEXT:    v_or_b32_e32 v37, v37, v51
-; SI-NEXT:    buffer_store_dword v37, v0, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v37, 0xff, v55
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v51, 8, v10
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; SI-NEXT:    v_or_b32_e32 v37, v37, v51
-; SI-NEXT:    v_and_b32_e32 v37, 0xffff, v37
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 24, v10
-; SI-NEXT:    v_or_b32_e32 v20, v48, v20
-; SI-NEXT:    v_or_b32_e32 v20, v37, v20
-; SI-NEXT:    v_add_i32_e32 v37, vcc, 4, v0
-; SI-NEXT:    buffer_store_dword v20, v37, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v20, 0xff, v57
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 8, v10
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
-; SI-NEXT:    v_or_b32_e32 v20, v20, v37
-; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v20
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v37, 0xff, v10
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v37
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 24, v10
-; SI-NEXT:    v_or_b32_e32 v37, v48, v37
-; SI-NEXT:    v_or_b32_e32 v20, v20, v37
-; SI-NEXT:    v_add_i32_e32 v37, vcc, 8, v0
-; SI-NEXT:    buffer_store_dword v20, v37, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v20, 0xff, v17
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 8, v10
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; SI-NEXT:    v_or_b32_e32 v20, v20, v37
-; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v20
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 24, v10
-; SI-NEXT:    v_or_b32_e32 v18, v37, v18
-; SI-NEXT:    v_or_b32_e32 v18, v20, v18
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 12, v0
-; SI-NEXT:    buffer_store_dword v18, v20, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v18, 0xff, v21
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 8, v10
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; SI-NEXT:    v_or_b32_e32 v18, v18, v20
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v18
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v20, 0xff, v10
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s21, v14
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v58
+; SI-NEXT:    s_or_b32 s20, s21, s20
+; SI-NEXT:    v_readfirstlane_b32 s21, v31
+; SI-NEXT:    s_lshl_b32 s21, s21, 16
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s22, v14
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v39
+; SI-NEXT:    s_or_b32 s21, s22, s21
+; SI-NEXT:    s_lshr_b64 s[62:63], s[20:21], 8
+; SI-NEXT:    s_lshr_b32 s52, s21, 8
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s22, v14
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v29
+; SI-NEXT:    s_lshl_b32 s22, s22, 16
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s23, v14
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v52
+; SI-NEXT:    s_or_b32 s22, s23, s22
+; SI-NEXT:    v_readfirstlane_b32 s23, v32
+; SI-NEXT:    s_lshl_b32 s23, s23, 16
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s24, v14
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v17
+; SI-NEXT:    s_or_b32 s23, s24, s23
+; SI-NEXT:    s_lshr_b32 s54, s23, 8
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s24, v14
+; SI-NEXT:    s_lshl_b32 s24, s24, 16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 24, v10
-; SI-NEXT:    v_or_b32_e32 v20, v37, v20
-; SI-NEXT:    v_or_b32_e32 v18, v18, v20
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 16, v0
-; SI-NEXT:    buffer_store_dword v18, v20, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v18, 0xff, v16
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s25, v14
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v30
+; SI-NEXT:    s_or_b32 s24, s25, s24
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v30, v14
+; SI-NEXT:    v_readfirstlane_b32 s25, v30
+; SI-NEXT:    s_lshl_b32 s25, s25, 16
+; SI-NEXT:    v_bfe_u32 v35, v30, 8, 8
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 8, v10
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
-; SI-NEXT:    v_or_b32_e32 v18, v18, v20
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v18
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s26, v14
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v22
+; SI-NEXT:    s_or_b32 s25, s26, s25
+; SI-NEXT:    s_lshr_b32 s64, s25, 8
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s26, v14
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v23
+; SI-NEXT:    s_lshl_b32 s26, s26, 16
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s27, v14
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v46
+; SI-NEXT:    s_or_b32 s26, s27, s26
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v46, v14
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v18
+; SI-NEXT:    v_bfe_u32 v18, v44, 8, 8
+; SI-NEXT:    v_readfirstlane_b32 s27, v46
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    s_lshl_b32 s27, s27, 16
+; SI-NEXT:    v_bfe_u32 v37, v46, 8, 8
+; SI-NEXT:    v_readfirstlane_b32 s28, v14
+; SI-NEXT:    s_or_b32 s27, s28, s27
+; SI-NEXT:    s_lshr_b32 s66, s27, 8
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 24, v10
-; SI-NEXT:    v_or_b32_e32 v15, v20, v15
-; SI-NEXT:    v_or_b32_e32 v15, v18, v15
-; SI-NEXT:    v_add_i32_e32 v18, vcc, 20, v0
-; SI-NEXT:    buffer_store_dword v15, v18, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v15, 0xff, v19
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s28, v14
+; SI-NEXT:    s_lshl_b32 s28, s28, 16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 8, v10
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; SI-NEXT:    v_or_b32_e32 v15, v15, v18
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s29, v14
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v56
+; SI-NEXT:    s_or_b32 s28, s29, s28
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v56, v14
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v15
+; SI-NEXT:    v_bfe_u32 v15, v5, 8, 8
+; SI-NEXT:    v_readfirstlane_b32 s29, v56
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    s_lshl_b32 s29, s29, 16
+; SI-NEXT:    v_bfe_u32 v38, v56, 8, 8
+; SI-NEXT:    v_readfirstlane_b32 s40, v14
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT:    s_or_b32 s29, s40, s29
+; SI-NEXT:    s_lshr_b32 s68, s29, 8
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v18, 0xff, v10
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s40, v14
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT:    s_lshl_b32 s40, s40, 16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 24, v10
-; SI-NEXT:    v_or_b32_e32 v18, v20, v18
-; SI-NEXT:    v_or_b32_e32 v15, v15, v18
-; SI-NEXT:    v_add_i32_e32 v18, vcc, 24, v0
-; SI-NEXT:    buffer_store_dword v15, v18, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v15, 0xff, v47
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s41, v14
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT:    s_or_b32 s40, s41, s40
+; SI-NEXT:    v_readfirstlane_b32 s41, v57
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    s_lshl_b32 s41, s41, 16
+; SI-NEXT:    v_readfirstlane_b32 s42, v14
+; SI-NEXT:    s_or_b32 s41, s42, s41
+; SI-NEXT:    s_lshr_b32 s69, s41, 8
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 8, v10
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; SI-NEXT:    v_or_b32_e32 v15, v15, v18
-; SI-NEXT:    v_and_b32_e32 v18, 0xff, v33
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s42, v14
+; SI-NEXT:    s_lshl_b32 s42, s42, 16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 24, v10
-; SI-NEXT:    v_or_b32_e32 v18, v20, v18
-; SI-NEXT:    v_or_b32_e32 v15, v15, v18
-; SI-NEXT:    v_add_i32_e32 v18, vcc, 28, v0
-; SI-NEXT:    buffer_store_dword v15, v18, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v15, 0xff, v43
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 8, v28
-; SI-NEXT:    v_or_b32_e32 v15, v15, v18
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s43, v14
+; SI-NEXT:    s_or_b32 s42, s43, s42
+; SI-NEXT:    v_readfirstlane_b32 s43, v12
+; SI-NEXT:    s_lshl_b32 s43, s43, 16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v18, 0xff, v10
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s44, v14
+; SI-NEXT:    s_or_b32 s43, s44, s43
+; SI-NEXT:    s_lshr_b32 s70, s43, 8
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 24, v10
-; SI-NEXT:    v_or_b32_e32 v18, v20, v18
-; SI-NEXT:    v_or_b32_e32 v15, v15, v18
-; SI-NEXT:    v_add_i32_e32 v18, vcc, 32, v0
-; SI-NEXT:    buffer_store_dword v15, v18, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v15, 0xff, v14
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s44, v14
+; SI-NEXT:    s_lshl_b32 s44, s44, 16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 8, v10
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; SI-NEXT:    v_or_b32_e32 v15, v15, v18
-; SI-NEXT:    v_and_b32_e32 v18, 0xff, v44
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v1
+; SI-NEXT:    v_mov_b32_e32 v1, v3
+; SI-NEXT:    v_bfe_u32 v47, v1, 8, 8
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s45, v14
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT:    s_or_b32 s44, s45, s44
+; SI-NEXT:    v_readfirstlane_b32 s45, v13
+; SI-NEXT:    s_lshl_b32 s45, s45, 16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 24, v10
-; SI-NEXT:    v_or_b32_e32 v18, v20, v18
-; SI-NEXT:    v_or_b32_e32 v15, v15, v18
-; SI-NEXT:    v_add_i32_e32 v18, vcc, 36, v0
-; SI-NEXT:    buffer_store_dword v15, v18, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_readfirstlane_b32 s46, v14
+; SI-NEXT:    s_or_b32 s45, s46, s45
+; SI-NEXT:    s_lshr_b64 s[46:47], s[44:45], 24
+; SI-NEXT:    v_writelane_b32 v62, s46, 4
+; SI-NEXT:    v_writelane_b32 v62, s47, 5
+; SI-NEXT:    s_lshr_b64 s[46:47], s[44:45], 16
+; SI-NEXT:    v_writelane_b32 v62, s46, 2
+; SI-NEXT:    v_writelane_b32 v62, s47, 3
+; SI-NEXT:    s_lshr_b64 s[46:47], s[44:45], 8
+; SI-NEXT:    v_writelane_b32 v62, s46, 0
+; SI-NEXT:    v_writelane_b32 v62, s47, 1
+; SI-NEXT:    s_lshr_b64 s[46:47], s[42:43], 24
+; SI-NEXT:    v_writelane_b32 v62, s46, 10
+; SI-NEXT:    v_writelane_b32 v62, s47, 11
+; SI-NEXT:    s_lshr_b64 s[46:47], s[42:43], 16
+; SI-NEXT:    v_writelane_b32 v62, s46, 8
+; SI-NEXT:    v_writelane_b32 v62, s47, 9
+; SI-NEXT:    s_lshr_b64 s[46:47], s[42:43], 8
+; SI-NEXT:    v_writelane_b32 v62, s46, 6
+; SI-NEXT:    v_writelane_b32 v62, s47, 7
+; SI-NEXT:    s_lshr_b64 s[46:47], s[40:41], 24
+; SI-NEXT:    v_writelane_b32 v62, s46, 16
+; SI-NEXT:    v_writelane_b32 v62, s47, 17
+; SI-NEXT:    s_lshr_b64 s[46:47], s[40:41], 16
+; SI-NEXT:    v_writelane_b32 v62, s46, 14
+; SI-NEXT:    v_writelane_b32 v62, s47, 15
+; SI-NEXT:    s_lshr_b64 s[46:47], s[40:41], 8
+; SI-NEXT:    v_writelane_b32 v62, s46, 12
+; SI-NEXT:    v_writelane_b32 v62, s47, 13
+; SI-NEXT:    s_lshr_b64 s[46:47], s[28:29], 24
+; SI-NEXT:    v_writelane_b32 v62, s46, 22
+; SI-NEXT:    v_writelane_b32 v62, s47, 23
+; SI-NEXT:    s_lshr_b64 s[46:47], s[28:29], 16
+; SI-NEXT:    v_writelane_b32 v62, s46, 20
+; SI-NEXT:    v_writelane_b32 v62, s47, 21
+; SI-NEXT:    s_lshr_b64 s[46:47], s[28:29], 8
+; SI-NEXT:    v_writelane_b32 v62, s46, 18
+; SI-NEXT:    v_writelane_b32 v62, s47, 19
+; SI-NEXT:    s_lshr_b64 s[46:47], s[26:27], 24
+; SI-NEXT:    v_writelane_b32 v62, s46, 28
+; SI-NEXT:    v_writelane_b32 v62, s47, 29
+; SI-NEXT:    s_lshr_b64 s[46:47], s[26:27], 16
+; SI-NEXT:    v_writelane_b32 v62, s46, 26
+; SI-NEXT:    v_writelane_b32 v62, s47, 27
+; SI-NEXT:    s_lshr_b64 s[46:47], s[26:27], 8
+; SI-NEXT:    v_writelane_b32 v62, s46, 24
+; SI-NEXT:    v_writelane_b32 v62, s47, 25
+; SI-NEXT:    s_lshr_b64 s[46:47], s[24:25], 24
+; SI-NEXT:    v_writelane_b32 v62, s46, 34
+; SI-NEXT:    v_writelane_b32 v62, s47, 35
+; SI-NEXT:    s_lshr_b64 s[46:47], s[24:25], 16
+; SI-NEXT:    v_writelane_b32 v62, s46, 32
+; SI-NEXT:    v_writelane_b32 v62, s47, 33
+; SI-NEXT:    s_lshr_b64 s[46:47], s[24:25], 8
+; SI-NEXT:    v_writelane_b32 v62, s46, 30
+; SI-NEXT:    v_writelane_b32 v62, s47, 31
+; SI-NEXT:    s_lshr_b64 s[46:47], s[22:23], 24
+; SI-NEXT:    v_writelane_b32 v62, s46, 40
+; SI-NEXT:    v_writelane_b32 v62, s47, 41
+; SI-NEXT:    s_lshr_b64 s[46:47], s[22:23], 16
+; SI-NEXT:    v_writelane_b32 v62, s46, 38
+; SI-NEXT:    v_writelane_b32 v62, s47, 39
+; SI-NEXT:    s_lshr_b64 s[46:47], s[22:23], 8
+; SI-NEXT:    v_writelane_b32 v62, s46, 36
+; SI-NEXT:    v_writelane_b32 v62, s47, 37
+; SI-NEXT:    s_lshr_b64 s[46:47], s[20:21], 24
+; SI-NEXT:    v_writelane_b32 v62, s46, 44
+; SI-NEXT:    v_writelane_b32 v62, s47, 45
+; SI-NEXT:    s_lshr_b64 s[46:47], s[20:21], 16
+; SI-NEXT:    v_writelane_b32 v62, s46, 42
+; SI-NEXT:    v_writelane_b32 v62, s47, 43
+; SI-NEXT:    v_writelane_b32 v62, vcc_lo, 48
+; SI-NEXT:    v_writelane_b32 v62, vcc_hi, 49
+; SI-NEXT:    s_lshr_b64 vcc, s[4:5], 8
+; SI-NEXT:    v_writelane_b32 v62, vcc_lo, 46
+; SI-NEXT:    s_lshr_b64 s[46:47], s[8:9], 24
+; SI-NEXT:    v_writelane_b32 v62, vcc_hi, 47
+; SI-NEXT:    s_lshr_b32 s71, s45, 8
+; SI-NEXT:  .LBB95_5: ; %end
+; SI-NEXT:    v_readlane_b32 vcc_lo, v62, 0
+; SI-NEXT:    v_readlane_b32 vcc_hi, v62, 1
+; SI-NEXT:    s_lshl_b32 s47, vcc_lo, 8
+; SI-NEXT:    v_readlane_b32 vcc_lo, v62, 2
+; SI-NEXT:    s_and_b32 s44, s44, 0xff
+; SI-NEXT:    v_readlane_b32 vcc_hi, v62, 3
+; SI-NEXT:    s_or_b32 s44, s44, s47
+; SI-NEXT:    s_and_b32 s47, vcc_lo, 0xff
+; SI-NEXT:    v_readlane_b32 vcc_lo, v62, 4
+; SI-NEXT:    s_lshl_b32 s57, vcc_lo, 24
+; SI-NEXT:    s_lshl_b32 s47, s47, 16
+; SI-NEXT:    s_or_b32 s47, s57, s47
+; SI-NEXT:    s_and_b32 s44, s44, 0xffff
+; SI-NEXT:    s_or_b32 s44, s44, s47
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v16, s44
+; SI-NEXT:    s_and_b32 s44, s45, 0xff
+; SI-NEXT:    s_lshl_b32 s45, s71, 8
+; SI-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; SI-NEXT:    s_or_b32 s44, s44, s45
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; SI-NEXT:    v_lshlrev_b32_e32 v21, 24, v24
+; SI-NEXT:    s_and_b32 s44, s44, 0xffff
+; SI-NEXT:    v_or_b32_e32 v13, v21, v13
+; SI-NEXT:    v_or_b32_e32 v13, s44, v13
+; SI-NEXT:    v_readlane_b32 s44, v62, 6
+; SI-NEXT:    v_readlane_b32 s45, v62, 7
+; SI-NEXT:    s_lshl_b32 s44, s44, 8
+; SI-NEXT:    s_and_b32 s42, s42, 0xff
+; SI-NEXT:    v_readlane_b32 vcc_hi, v62, 5
+; SI-NEXT:    s_or_b32 s42, s42, s44
+; SI-NEXT:    v_readlane_b32 s44, v62, 8
+; SI-NEXT:    v_readlane_b32 s45, v62, 9
+; SI-NEXT:    s_and_b32 s44, s44, 0xff
+; SI-NEXT:    v_readlane_b32 vcc_lo, v62, 10
+; SI-NEXT:    s_lshl_b32 s45, vcc_lo, 24
+; SI-NEXT:    s_lshl_b32 s44, s44, 16
+; SI-NEXT:    s_or_b32 s44, s45, s44
+; SI-NEXT:    s_and_b32 s42, s42, 0xffff
+; SI-NEXT:    s_or_b32 s42, s42, s44
+; SI-NEXT:    v_mov_b32_e32 v21, s42
+; SI-NEXT:    s_and_b32 s42, s43, 0xff
+; SI-NEXT:    s_lshl_b32 s43, s70, 8
+; SI-NEXT:    v_and_b32_e32 v12, 0xff, v12
+; SI-NEXT:    s_or_b32 s42, s42, s43
+; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; SI-NEXT:    v_lshlrev_b32_e32 v23, 24, v48
+; SI-NEXT:    s_and_b32 s42, s42, 0xffff
+; SI-NEXT:    v_or_b32_e32 v12, v23, v12
+; SI-NEXT:    v_or_b32_e32 v12, s42, v12
+; SI-NEXT:    v_readlane_b32 s42, v62, 12
+; SI-NEXT:    v_readlane_b32 s43, v62, 13
+; SI-NEXT:    s_lshl_b32 s42, s42, 8
+; SI-NEXT:    s_and_b32 s40, s40, 0xff
+; SI-NEXT:    s_or_b32 s40, s40, s42
+; SI-NEXT:    v_readlane_b32 s42, v62, 14
+; SI-NEXT:    v_readlane_b32 s43, v62, 15
+; SI-NEXT:    s_and_b32 s42, s42, 0xff
+; SI-NEXT:    v_readlane_b32 s44, v62, 16
+; SI-NEXT:    s_lshl_b32 s43, s44, 24
+; SI-NEXT:    s_lshl_b32 s42, s42, 16
+; SI-NEXT:    s_or_b32 s42, s43, s42
+; SI-NEXT:    s_and_b32 s40, s40, 0xffff
+; SI-NEXT:    s_or_b32 s40, s40, s42
+; SI-NEXT:    v_mov_b32_e32 v23, s40
+; SI-NEXT:    s_and_b32 s40, s41, 0xff
+; SI-NEXT:    s_lshl_b32 s41, s69, 8
+; SI-NEXT:    v_and_b32_e32 v11, 0xff, v57
+; SI-NEXT:    s_or_b32 s40, s40, s41
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v24, 24, v50
+; SI-NEXT:    s_and_b32 s40, s40, 0xffff
+; SI-NEXT:    v_or_b32_e32 v11, v24, v11
+; SI-NEXT:    v_or_b32_e32 v11, s40, v11
+; SI-NEXT:    v_readlane_b32 s40, v62, 18
+; SI-NEXT:    v_readlane_b32 s41, v62, 19
+; SI-NEXT:    s_lshl_b32 s40, s40, 8
+; SI-NEXT:    s_and_b32 s28, s28, 0xff
+; SI-NEXT:    s_or_b32 s28, s28, s40
+; SI-NEXT:    v_readlane_b32 s40, v62, 20
+; SI-NEXT:    v_readlane_b32 s41, v62, 21
+; SI-NEXT:    s_and_b32 s40, s40, 0xff
+; SI-NEXT:    v_readlane_b32 s42, v62, 22
+; SI-NEXT:    s_lshl_b32 s41, s42, 24
+; SI-NEXT:    s_lshl_b32 s40, s40, 16
+; SI-NEXT:    s_or_b32 s40, s41, s40
+; SI-NEXT:    s_and_b32 s28, s28, 0xffff
+; SI-NEXT:    s_or_b32 s28, s28, s40
+; SI-NEXT:    v_mov_b32_e32 v24, s28
+; SI-NEXT:    s_and_b32 s28, s29, 0xff
+; SI-NEXT:    s_lshl_b32 s29, s68, 8
+; SI-NEXT:    v_and_b32_e32 v25, 0xff, v56
+; SI-NEXT:    s_or_b32 s28, s28, s29
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; SI-NEXT:    v_lshlrev_b32_e32 v26, 24, v38
+; SI-NEXT:    s_and_b32 s28, s28, 0xffff
+; SI-NEXT:    v_or_b32_e32 v25, v26, v25
+; SI-NEXT:    v_or_b32_e32 v25, s28, v25
+; SI-NEXT:    v_readlane_b32 s28, v62, 24
+; SI-NEXT:    v_readlane_b32 s29, v62, 25
+; SI-NEXT:    s_lshl_b32 s28, s28, 8
+; SI-NEXT:    s_and_b32 s26, s26, 0xff
+; SI-NEXT:    s_or_b32 s26, s26, s28
+; SI-NEXT:    v_readlane_b32 s28, v62, 26
+; SI-NEXT:    v_readlane_b32 s29, v62, 27
+; SI-NEXT:    s_and_b32 s28, s28, 0xff
+; SI-NEXT:    v_readlane_b32 s40, v62, 28
+; SI-NEXT:    s_lshl_b32 s29, s40, 24
+; SI-NEXT:    s_lshl_b32 s28, s28, 16
+; SI-NEXT:    s_or_b32 s28, s29, s28
+; SI-NEXT:    s_and_b32 s26, s26, 0xffff
+; SI-NEXT:    s_or_b32 s26, s26, s28
+; SI-NEXT:    v_mov_b32_e32 v26, s26
+; SI-NEXT:    s_and_b32 s26, s27, 0xff
+; SI-NEXT:    s_lshl_b32 s27, s66, 8
+; SI-NEXT:    v_and_b32_e32 v27, 0xff, v46
+; SI-NEXT:    s_or_b32 s26, s26, s27
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; SI-NEXT:    v_lshlrev_b32_e32 v28, 24, v37
+; SI-NEXT:    s_and_b32 s26, s26, 0xffff
+; SI-NEXT:    v_or_b32_e32 v27, v28, v27
+; SI-NEXT:    v_or_b32_e32 v27, s26, v27
+; SI-NEXT:    v_readlane_b32 s26, v62, 30
+; SI-NEXT:    v_readlane_b32 s27, v62, 31
+; SI-NEXT:    s_lshl_b32 s26, s26, 8
+; SI-NEXT:    s_and_b32 s24, s24, 0xff
+; SI-NEXT:    s_or_b32 s24, s24, s26
+; SI-NEXT:    v_readlane_b32 s26, v62, 32
+; SI-NEXT:    v_readlane_b32 s27, v62, 33
+; SI-NEXT:    s_and_b32 s26, s26, 0xff
+; SI-NEXT:    v_readlane_b32 s28, v62, 34
+; SI-NEXT:    s_lshl_b32 s27, s28, 24
+; SI-NEXT:    s_lshl_b32 s26, s26, 16
+; SI-NEXT:    v_readlane_b32 vcc_hi, v62, 11
+; SI-NEXT:    s_or_b32 s26, s27, s26
+; SI-NEXT:    s_and_b32 s24, s24, 0xffff
+; SI-NEXT:    s_or_b32 s24, s24, s26
+; SI-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v15, 0xff, v58
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 8, v10
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT:    v_or_b32_e32 v15, v15, v18
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v18, 0xff, v10
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 24, v10
-; SI-NEXT:    v_or_b32_e32 v18, v20, v18
-; SI-NEXT:    v_or_b32_e32 v15, v15, v18
-; SI-NEXT:    v_add_i32_e32 v18, vcc, 40, v0
-; SI-NEXT:    buffer_store_dword v15, v18, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v16, vcc, 4, v0
+; SI-NEXT:    v_mov_b32_e32 v28, s24
+; SI-NEXT:    s_and_b32 s24, s25, 0xff
+; SI-NEXT:    s_lshl_b32 s25, s64, 8
+; SI-NEXT:    v_and_b32_e32 v29, 0xff, v30
+; SI-NEXT:    buffer_store_dword v13, v16, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v15, 0xff, v60
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 8, v10
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; SI-NEXT:    v_or_b32_e32 v15, v15, v18
-; SI-NEXT:    v_and_b32_e32 v18, 0xff, v31
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 24, v10
-; SI-NEXT:    v_or_b32_e32 v18, v20, v18
-; SI-NEXT:    v_or_b32_e32 v15, v15, v18
-; SI-NEXT:    v_add_i32_e32 v18, vcc, 44, v0
-; SI-NEXT:    buffer_store_dword v15, v18, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v13, vcc, 8, v0
+; SI-NEXT:    s_or_b32 s24, s24, s25
+; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; SI-NEXT:    v_lshlrev_b32_e32 v30, 24, v35
+; SI-NEXT:    buffer_store_dword v21, v13, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v13, vcc, 12, v0
+; SI-NEXT:    s_and_b32 s24, s24, 0xffff
+; SI-NEXT:    v_or_b32_e32 v29, v30, v29
+; SI-NEXT:    buffer_store_dword v12, v13, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v15, 0xff, v22
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 8, v10
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT:    v_or_b32_e32 v15, v15, v18
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v18, 0xff, v10
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 24, v10
-; SI-NEXT:    v_or_b32_e32 v18, v20, v18
-; SI-NEXT:    v_or_b32_e32 v15, v15, v18
-; SI-NEXT:    v_add_i32_e32 v18, vcc, 48, v0
-; SI-NEXT:    buffer_store_dword v15, v18, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v12, vcc, 16, v0
+; SI-NEXT:    v_or_b32_e32 v29, s24, v29
+; SI-NEXT:    buffer_store_dword v23, v12, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v12, vcc, 20, v0
+; SI-NEXT:    v_readlane_b32 s24, v62, 36
+; SI-NEXT:    buffer_store_dword v11, v12, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v15, 0xff, v24
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 8, v10
+; SI-NEXT:    v_add_i32_e32 v11, vcc, 24, v0
+; SI-NEXT:    v_readlane_b32 s25, v62, 37
+; SI-NEXT:    s_lshl_b32 s24, s24, 8
+; SI-NEXT:    s_and_b32 s22, s22, 0xff
+; SI-NEXT:    buffer_store_dword v24, v11, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v11, vcc, 28, v0
+; SI-NEXT:    s_or_b32 s22, s22, s24
+; SI-NEXT:    v_readlane_b32 s24, v62, 38
+; SI-NEXT:    buffer_store_dword v25, v11, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v11, vcc, 32, v0
+; SI-NEXT:    v_readlane_b32 s25, v62, 39
+; SI-NEXT:    s_and_b32 s24, s24, 0xff
+; SI-NEXT:    v_readlane_b32 s26, v62, 40
+; SI-NEXT:    buffer_store_dword v26, v11, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v11, vcc, 36, v0
+; SI-NEXT:    s_lshl_b32 s24, s24, 16
+; SI-NEXT:    s_lshl_b32 s25, s26, 24
+; SI-NEXT:    buffer_store_dword v27, v11, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v11, vcc, 40, v0
+; SI-NEXT:    s_and_b32 s22, s22, 0xffff
+; SI-NEXT:    s_or_b32 s24, s25, s24
+; SI-NEXT:    buffer_store_dword v28, v11, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v11, vcc, 44, v0
+; SI-NEXT:    s_or_b32 s22, s22, s24
+; SI-NEXT:    buffer_store_dword v29, v11, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v11, vcc, 48, v0
+; SI-NEXT:    v_mov_b32_e32 v12, s22
+; SI-NEXT:    s_and_b32 s22, s23, 0xff
+; SI-NEXT:    s_lshl_b32 s23, s54, 8
 ; SI-NEXT:    v_and_b32_e32 v10, 0xff, v32
-; SI-NEXT:    v_or_b32_e32 v15, v15, v18
+; SI-NEXT:    buffer_store_dword v12, v11, s[0:3], 0 offen
+; SI-NEXT:    s_or_b32 s22, s22, s23
 ; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 24, v13
-; SI-NEXT:    v_or_b32_e32 v10, v18, v10
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; SI-NEXT:    v_or_b32_e32 v10, v15, v10
-; SI-NEXT:    v_add_i32_e32 v15, vcc, 52, v0
-; SI-NEXT:    buffer_store_dword v10, v15, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v10, 0xff, v4
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 8, v13
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT:    v_or_b32_e32 v10, v10, v15
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v15, 0xff, v13
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 24, v13
-; SI-NEXT:    v_or_b32_e32 v15, v18, v15
-; SI-NEXT:    v_or_b32_e32 v10, v10, v15
-; SI-NEXT:    v_add_i32_e32 v15, vcc, 56, v0
-; SI-NEXT:    buffer_store_dword v10, v15, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v43
+; SI-NEXT:    s_and_b32 s22, s22, 0xffff
+; SI-NEXT:    v_or_b32_e32 v10, v11, v10
+; SI-NEXT:    v_or_b32_e32 v10, s22, v10
+; SI-NEXT:    s_and_b32 s20, s20, 0xff
+; SI-NEXT:    s_lshl_b32 s22, s62, 8
+; SI-NEXT:    s_or_b32 s20, s20, s22
+; SI-NEXT:    v_readlane_b32 s22, v62, 42
+; SI-NEXT:    v_readlane_b32 s23, v62, 43
+; SI-NEXT:    s_and_b32 s22, s22, 0xff
+; SI-NEXT:    v_readlane_b32 s24, v62, 44
+; SI-NEXT:    s_lshl_b32 s22, s22, 16
+; SI-NEXT:    s_lshl_b32 s23, s24, 24
+; SI-NEXT:    s_and_b32 s20, s20, 0xffff
+; SI-NEXT:    s_or_b32 s22, s23, s22
+; SI-NEXT:    v_add_i32_e32 v11, vcc, 52, v0
+; SI-NEXT:    s_or_b32 s20, s20, s22
+; SI-NEXT:    buffer_store_dword v10, v11, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v10, 0xff, v34
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 8, v29
-; SI-NEXT:    v_or_b32_e32 v10, v10, v15
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 24, v13
-; SI-NEXT:    v_or_b32_e32 v9, v15, v9
+; SI-NEXT:    v_add_i32_e32 v10, vcc, 56, v0
+; SI-NEXT:    v_mov_b32_e32 v11, s20
+; SI-NEXT:    s_and_b32 s20, s21, 0xff
+; SI-NEXT:    s_lshl_b32 s21, s52, 8
+; SI-NEXT:    v_and_b32_e32 v9, 0xff, v31
+; SI-NEXT:    buffer_store_dword v11, v10, s[0:3], 0 offen
+; SI-NEXT:    s_or_b32 s20, s20, s21
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 24, v20
+; SI-NEXT:    s_and_b32 s20, s20, 0xffff
 ; SI-NEXT:    v_or_b32_e32 v9, v10, v9
+; SI-NEXT:    v_or_b32_e32 v9, s20, v9
+; SI-NEXT:    s_and_b32 s18, s18, 0xff
+; SI-NEXT:    s_lshl_b32 s20, s58, 8
+; SI-NEXT:    s_or_b32 s18, s18, s20
+; SI-NEXT:    s_and_b32 s20, s98, 0xff
+; SI-NEXT:    s_lshl_b32 s20, s20, 16
+; SI-NEXT:    s_lshl_b32 s21, s96, 24
+; SI-NEXT:    s_and_b32 s18, s18, 0xffff
+; SI-NEXT:    s_or_b32 s20, s21, s20
 ; SI-NEXT:    v_add_i32_e32 v10, vcc, 60, v0
+; SI-NEXT:    s_or_b32 s18, s18, s20
 ; SI-NEXT:    buffer_store_dword v9, v10, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v9, 0xff, v3
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
-; SI-NEXT:    v_or_b32_e32 v9, v9, v10
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 24, v13
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT:    v_or_b32_e32 v10, v15, v10
-; SI-NEXT:    v_or_b32_e32 v9, v9, v10
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 64, v0
-; SI-NEXT:    buffer_store_dword v9, v10, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v9, 0xff, v59
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 8, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
-; SI-NEXT:    v_or_b32_e32 v9, v9, v10
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 24, v3
-; SI-NEXT:    v_or_b32_e32 v8, v10, v8
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 64, v0
+; SI-NEXT:    v_mov_b32_e32 v10, s18
+; SI-NEXT:    s_and_b32 s18, s19, 0xff
+; SI-NEXT:    s_lshl_b32 s19, s50, 8
+; SI-NEXT:    v_and_b32_e32 v8, 0xff, v54
+; SI-NEXT:    buffer_store_dword v10, v9, s[0:3], 0 offen
+; SI-NEXT:    s_or_b32 s18, s18, s19
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 24, v19
+; SI-NEXT:    s_and_b32 s18, s18, 0xffff
 ; SI-NEXT:    v_or_b32_e32 v8, v9, v8
+; SI-NEXT:    v_or_b32_e32 v8, s18, v8
+; SI-NEXT:    s_and_b32 s16, s16, 0xff
+; SI-NEXT:    s_lshl_b32 s18, s38, 8
+; SI-NEXT:    s_or_b32 s16, s16, s18
+; SI-NEXT:    s_and_b32 s18, s36, 0xff
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_lshl_b32 s19, s34, 24
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s18, s19, s18
 ; SI-NEXT:    v_add_i32_e32 v9, vcc, 0x44, v0
+; SI-NEXT:    s_or_b32 s16, s16, s18
 ; SI-NEXT:    buffer_store_dword v8, v9, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v49
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 8, v4
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT:    v_or_b32_e32 v8, v8, v9
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v9, 0xff, v4
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 24, v4
-; SI-NEXT:    v_or_b32_e32 v9, v10, v9
-; SI-NEXT:    v_or_b32_e32 v8, v8, v9
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 0x48, v0
-; SI-NEXT:    buffer_store_dword v8, v9, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v62
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 8, v4
-; SI-NEXT:    v_or_b32_e32 v8, v8, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 24, v30
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; SI-NEXT:    v_or_b32_e32 v7, v9, v7
+; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x48, v0
+; SI-NEXT:    v_mov_b32_e32 v9, s16
+; SI-NEXT:    s_and_b32 s16, s17, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s48, 8
+; SI-NEXT:    v_and_b32_e32 v7, 0xff, v44
+; SI-NEXT:    buffer_store_dword v9, v8, s[0:3], 0 offen
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 24, v18
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
 ; SI-NEXT:    v_or_b32_e32 v7, v8, v7
+; SI-NEXT:    v_or_b32_e32 v7, s16, v7
+; SI-NEXT:    s_and_b32 s14, s14, 0xff
+; SI-NEXT:    s_lshl_b32 s16, s30, 8
+; SI-NEXT:    s_or_b32 s14, s14, s16
+; SI-NEXT:    s_and_b32 s16, s94, 0xff
+; SI-NEXT:    s_lshl_b32 s16, s16, 16
+; SI-NEXT:    s_lshl_b32 s17, s92, 24
+; SI-NEXT:    s_and_b32 s14, s14, 0xffff
+; SI-NEXT:    s_or_b32 s16, s17, s16
 ; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x4c, v0
+; SI-NEXT:    s_or_b32 s14, s14, s16
 ; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v36
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 8, v4
-; SI-NEXT:    v_or_b32_e32 v7, v7, v8
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; SI-NEXT:    v_or_b32_e32 v8, v9, v8
-; SI-NEXT:    v_or_b32_e32 v7, v7, v8
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x50, v0
-; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v25
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 8, v4
-; SI-NEXT:    v_or_b32_e32 v7, v7, v8
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 24, v3
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; SI-NEXT:    v_or_b32_e32 v6, v8, v6
+; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x50, v0
+; SI-NEXT:    v_mov_b32_e32 v8, s14
+; SI-NEXT:    s_and_b32 s14, s15, 0xff
+; SI-NEXT:    s_lshl_b32 s15, s67, 8
+; SI-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; SI-NEXT:    buffer_store_dword v8, v7, s[0:3], 0 offen
+; SI-NEXT:    s_or_b32 s14, s14, s15
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v42
+; SI-NEXT:    s_and_b32 s14, s14, 0xffff
 ; SI-NEXT:    v_or_b32_e32 v6, v7, v6
+; SI-NEXT:    v_or_b32_e32 v6, s14, v6
+; SI-NEXT:    s_and_b32 s12, s12, 0xff
+; SI-NEXT:    s_lshl_b32 s14, s90, 8
+; SI-NEXT:    s_or_b32 s12, s12, s14
+; SI-NEXT:    s_and_b32 s14, s88, 0xff
+; SI-NEXT:    s_lshl_b32 s14, s14, 16
+; SI-NEXT:    s_lshl_b32 s15, s78, 24
+; SI-NEXT:    s_and_b32 s12, s12, 0xffff
+; SI-NEXT:    s_or_b32 s14, s15, s14
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x54, v0
+; SI-NEXT:    s_or_b32 s12, s12, s14
 ; SI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v6, 0xff, v35
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
-; SI-NEXT:    v_or_b32_e32 v6, v6, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    v_or_b32_e32 v7, v8, v7
-; SI-NEXT:    v_or_b32_e32 v6, v6, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x58, v0
-; SI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v6, 0xff, v23
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v4
-; SI-NEXT:    v_or_b32_e32 v6, v6, v7
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v3
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; SI-NEXT:    v_or_b32_e32 v5, v7, v5
+; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x58, v0
+; SI-NEXT:    v_mov_b32_e32 v7, s12
+; SI-NEXT:    s_and_b32 s12, s13, 0xff
+; SI-NEXT:    s_lshl_b32 s13, s65, 8
+; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; SI-NEXT:    buffer_store_dword v7, v6, s[0:3], 0 offen
+; SI-NEXT:    s_or_b32 s12, s12, s13
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v15
+; SI-NEXT:    s_and_b32 s12, s12, 0xffff
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v5
+; SI-NEXT:    v_or_b32_e32 v5, s12, v5
+; SI-NEXT:    s_and_b32 s10, s10, 0xff
+; SI-NEXT:    s_lshl_b32 s12, s76, 8
+; SI-NEXT:    s_or_b32 s10, s10, s12
+; SI-NEXT:    s_and_b32 s12, s74, 0xff
+; SI-NEXT:    s_lshl_b32 s12, s12, 16
+; SI-NEXT:    s_lshl_b32 s13, s72, 24
+; SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; SI-NEXT:    s_or_b32 s12, s13, s12
 ; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x5c, v0
+; SI-NEXT:    s_or_b32 s10, s10, s12
 ; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(2) expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v3
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
-; SI-NEXT:    v_or_b32_e32 v5, v5, v6
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT:    v_or_b32_e32 v6, v7, v6
-; SI-NEXT:    v_or_b32_e32 v5, v5, v6
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x60, v0
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v61
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v4
-; SI-NEXT:    v_or_b32_e32 v5, v5, v6
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v4, 0xff, v3
+; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x60, v0
+; SI-NEXT:    v_mov_b32_e32 v6, s10
+; SI-NEXT:    s_and_b32 s10, s11, 0xff
+; SI-NEXT:    s_lshl_b32 s11, s55, 8
+; SI-NEXT:    v_and_b32_e32 v4, 0xff, v55
+; SI-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
+; SI-NEXT:    s_or_b32 s10, s10, s11
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; SI-NEXT:    v_or_b32_e32 v4, v6, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 24, v45
+; SI-NEXT:    s_and_b32 s10, s10, 0xffff
 ; SI-NEXT:    v_or_b32_e32 v4, v5, v4
+; SI-NEXT:    v_or_b32_e32 v4, s10, v4
+; SI-NEXT:    s_and_b32 s8, s8, 0xff
+; SI-NEXT:    s_lshl_b32 s10, s60, 8
+; SI-NEXT:    s_or_b32 s8, s8, s10
+; SI-NEXT:    s_and_b32 s10, s56, 0xff
+; SI-NEXT:    s_lshl_b32 s10, s10, 16
+; SI-NEXT:    s_lshl_b32 s11, s46, 24
+; SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; SI-NEXT:    s_or_b32 s10, s11, s10
 ; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x64, v0
+; SI-NEXT:    s_or_b32 s8, s8, s10
 ; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v40
-; SI-NEXT:    s_waitcnt vmcnt(1) expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xff, v3
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; SI-NEXT:    v_or_b32_e32 v4, v4, v5
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_or_b32_e32 v5, v6, v5
-; SI-NEXT:    v_or_b32_e32 v4, v4, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x68, v0
-; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xff, v56
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; SI-NEXT:    v_or_b32_e32 v4, v4, v5
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; SI-NEXT:    v_or_b32_e32 v2, v5, v2
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x68, v0
+; SI-NEXT:    v_mov_b32_e32 v5, s8
+; SI-NEXT:    s_and_b32 s8, s9, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s53, 8
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 offen
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v61
+; SI-NEXT:    s_and_b32 s8, s8, 0xffff
 ; SI-NEXT:    v_or_b32_e32 v2, v4, v2
+; SI-NEXT:    v_or_b32_e32 v2, s8, v2
+; SI-NEXT:    s_and_b32 s6, s6, 0xff
+; SI-NEXT:    s_lshl_b32 s8, s86, 8
+; SI-NEXT:    s_or_b32 s6, s6, s8
+; SI-NEXT:    s_and_b32 s8, s84, 0xff
+; SI-NEXT:    s_lshl_b32 s8, s8, 16
+; SI-NEXT:    s_lshl_b32 s9, s82, 24
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_or_b32 s8, s9, s8
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x6c, v0
+; SI-NEXT:    s_or_b32 s6, s6, s8
 ; SI-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v53
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 24, v38
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_or_b32_e32 v2, v2, v4
-; SI-NEXT:    v_and_b32_e32 v4, 0xff, v39
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    v_or_b32_e32 v4, v5, v4
-; SI-NEXT:    v_or_b32_e32 v2, v2, v4
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x70, v0
-; SI-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v54
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; SI-NEXT:    v_or_b32_e32 v2, v2, v4
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT:    v_or_b32_e32 v1, v4, v1
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x70, v0
+; SI-NEXT:    v_mov_b32_e32 v4, s6
+; SI-NEXT:    s_and_b32 s6, s7, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s51, 8
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 24, v47
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
+; SI-NEXT:    v_or_b32_e32 v1, s6, v1
+; SI-NEXT:    v_readlane_b32 s6, v62, 46
+; SI-NEXT:    s_and_b32 s4, s4, 0xff
+; SI-NEXT:    s_lshl_b32 s6, s6, 8
+; SI-NEXT:    v_readlane_b32 s7, v62, 47
+; SI-NEXT:    s_or_b32 s4, s4, s6
+; SI-NEXT:    s_and_b32 s6, s80, 0xff
+; SI-NEXT:    v_readlane_b32 s8, v62, 48
+; SI-NEXT:    s_lshl_b32 s6, s6, 16
+; SI-NEXT:    s_lshl_b32 s7, s8, 24
+; SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; SI-NEXT:    s_or_b32 s6, s7, s6
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x74, v0
+; SI-NEXT:    s_or_b32 s4, s4, s6
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v42
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v50
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v27
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    v_or_b32_e32 v2, v4, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x78, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x78, v0
+; SI-NEXT:    v_mov_b32_e32 v2, s4
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_and_b32 s4, s5, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s49, 8
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v53
+; SI-NEXT:    s_or_b32 s4, s4, s5
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v12
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 24, v33
+; SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; SI-NEXT:    v_or_b32_e32 v1, v2, v1
+; SI-NEXT:    v_or_b32_e32 v1, s4, v1
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v26
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT:    v_readlane_b32 s45, v62, 17
+; SI-NEXT:    v_readlane_b32 s43, v62, 23
+; SI-NEXT:    v_readlane_b32 s41, v62, 29
+; SI-NEXT:    v_readlane_b32 s29, v62, 35
+; SI-NEXT:    v_readlane_b32 s27, v62, 41
+; SI-NEXT:    v_readlane_b32 s25, v62, 45
+; SI-NEXT:    v_readlane_b32 s9, v62, 49
+; SI-NEXT:    v_readlane_b32 s99, v63, 35
+; SI-NEXT:    v_readlane_b32 s98, v63, 34
+; SI-NEXT:    v_readlane_b32 s97, v63, 33
+; SI-NEXT:    v_readlane_b32 s96, v63, 32
+; SI-NEXT:    v_readlane_b32 s87, v63, 31
+; SI-NEXT:    v_readlane_b32 s86, v63, 30
+; SI-NEXT:    v_readlane_b32 s85, v63, 29
+; SI-NEXT:    v_readlane_b32 s84, v63, 28
+; SI-NEXT:    v_readlane_b32 s83, v63, 27
+; SI-NEXT:    v_readlane_b32 s82, v63, 26
+; SI-NEXT:    v_readlane_b32 s81, v63, 25
+; SI-NEXT:    v_readlane_b32 s80, v63, 24
+; SI-NEXT:    v_readlane_b32 s71, v63, 23
+; SI-NEXT:    v_readlane_b32 s70, v63, 22
+; SI-NEXT:    v_readlane_b32 s69, v63, 21
+; SI-NEXT:    v_readlane_b32 s68, v63, 20
+; SI-NEXT:    v_readlane_b32 s67, v63, 19
+; SI-NEXT:    v_readlane_b32 s66, v63, 18
+; SI-NEXT:    v_readlane_b32 s65, v63, 17
+; SI-NEXT:    v_readlane_b32 s64, v63, 16
+; SI-NEXT:    v_readlane_b32 s55, v63, 15
+; SI-NEXT:    v_readlane_b32 s54, v63, 14
+; SI-NEXT:    v_readlane_b32 s53, v63, 13
+; SI-NEXT:    v_readlane_b32 s52, v63, 12
+; SI-NEXT:    v_readlane_b32 s51, v63, 11
+; SI-NEXT:    v_readlane_b32 s50, v63, 10
+; SI-NEXT:    v_readlane_b32 s49, v63, 9
+; SI-NEXT:    v_readlane_b32 s48, v63, 8
+; SI-NEXT:    v_readlane_b32 s39, v63, 7
+; SI-NEXT:    v_readlane_b32 s38, v63, 6
+; SI-NEXT:    v_readlane_b32 s37, v63, 5
+; SI-NEXT:    v_readlane_b32 s36, v63, 4
+; SI-NEXT:    v_readlane_b32 s35, v63, 3
+; SI-NEXT:    v_readlane_b32 s34, v63, 2
+; SI-NEXT:    v_readlane_b32 s31, v63, 1
+; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -198090,1615 +199680,1736 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
-; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:332
-; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:328
-; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:324
-; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:320
-; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:308
-; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:304
-; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:300
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:296
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:292
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:288
-; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:276
-; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:272
-; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:268
-; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:264
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:260
-; SI-NEXT:    s_waitcnt expcnt(6)
-; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:256
-; SI-NEXT:    s_waitcnt expcnt(5)
-; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:244
-; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:240
+; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:332
+; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:328
+; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:324
+; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:320
+; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:316
+; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:312
+; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:308
+; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:304
+; SI-NEXT:    ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
 ; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:236
-; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:232
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:228
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:224
-; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:212
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:208
-; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:204
-; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:200
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:196
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
-; SI-NEXT:    v_writelane_b32 v63, s36, 4
-; SI-NEXT:    v_writelane_b32 v63, s37, 5
-; SI-NEXT:    ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v63, s38, 6
-; SI-NEXT:    v_writelane_b32 v62, s28, 0
-; SI-NEXT:    v_writelane_b32 v63, s39, 7
-; SI-NEXT:    v_writelane_b32 v62, s25, 1
-; SI-NEXT:    v_writelane_b32 v63, s48, 8
-; SI-NEXT:    v_writelane_b32 v62, s24, 2
-; SI-NEXT:    v_writelane_b32 v63, s49, 9
-; SI-NEXT:    v_writelane_b32 v62, s23, 3
-; SI-NEXT:    v_writelane_b32 v63, s50, 10
-; SI-NEXT:    v_writelane_b32 v62, s22, 4
-; SI-NEXT:    v_writelane_b32 v63, s51, 11
-; SI-NEXT:    v_writelane_b32 v62, s21, 5
-; SI-NEXT:    v_writelane_b32 v63, s52, 12
-; SI-NEXT:    v_writelane_b32 v62, s18, 6
-; SI-NEXT:    v_writelane_b32 v63, s53, 13
-; SI-NEXT:    v_writelane_b32 v62, s16, 7
-; SI-NEXT:    v_writelane_b32 v63, s54, 14
-; SI-NEXT:    v_writelane_b32 v63, s55, 15
-; SI-NEXT:    v_writelane_b32 v63, s64, 16
-; SI-NEXT:    v_writelane_b32 v63, s65, 17
-; SI-NEXT:    v_writelane_b32 v63, s66, 18
-; SI-NEXT:    v_writelane_b32 v63, s67, 19
-; SI-NEXT:    v_writelane_b32 v63, s68, 20
-; SI-NEXT:    v_writelane_b32 v63, s69, 21
-; SI-NEXT:    v_writelane_b32 v63, s70, 22
-; SI-NEXT:    v_writelane_b32 v63, s71, 23
-; SI-NEXT:    v_writelane_b32 v63, s80, 24
-; SI-NEXT:    v_writelane_b32 v63, s81, 25
-; SI-NEXT:    v_writelane_b32 v63, s82, 26
-; SI-NEXT:    v_writelane_b32 v63, s83, 27
-; SI-NEXT:    v_writelane_b32 v63, s84, 28
-; SI-NEXT:    v_writelane_b32 v63, s85, 29
-; SI-NEXT:    v_writelane_b32 v63, s86, 30
-; SI-NEXT:    v_writelane_b32 v63, s87, 31
-; SI-NEXT:    v_writelane_b32 v63, s96, 32
-; SI-NEXT:    v_writelane_b32 v63, s97, 33
-; SI-NEXT:    v_writelane_b32 v63, s98, 34
-; SI-NEXT:    v_readfirstlane_b32 s15, v16
-; SI-NEXT:    v_readfirstlane_b32 s18, v25
-; SI-NEXT:    v_readfirstlane_b32 s43, v15
-; SI-NEXT:    v_readfirstlane_b32 s42, v24
-; SI-NEXT:    v_readfirstlane_b32 s44, v23
-; SI-NEXT:    v_readfirstlane_b32 s49, v12
-; SI-NEXT:    v_readfirstlane_b32 s8, v11
-; SI-NEXT:    v_readfirstlane_b32 s53, v20
-; SI-NEXT:    v_readfirstlane_b32 s40, v19
-; SI-NEXT:    v_readfirstlane_b32 s23, v28
-; SI-NEXT:    s_waitcnt vmcnt(14)
+; SI-NEXT:    v_writelane_b32 v41, s30, 0
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_writelane_b32 v43, s29, 0
+; SI-NEXT:    v_writelane_b32 v43, s28, 1
+; SI-NEXT:    v_writelane_b32 v43, s27, 2
+; SI-NEXT:    v_writelane_b32 v43, s26, 3
+; SI-NEXT:    v_writelane_b32 v43, s25, 4
+; SI-NEXT:    v_writelane_b32 v43, s24, 5
+; SI-NEXT:    v_writelane_b32 v43, s23, 6
+; SI-NEXT:    v_writelane_b32 v43, s22, 7
+; SI-NEXT:    v_writelane_b32 v43, s21, 8
+; SI-NEXT:    v_writelane_b32 v43, s20, 9
+; SI-NEXT:    v_writelane_b32 v43, s19, 10
+; SI-NEXT:    v_writelane_b32 v43, s18, 11
+; SI-NEXT:    v_writelane_b32 v43, s17, 12
+; SI-NEXT:    v_writelane_b32 v43, s16, 13
+; SI-NEXT:    v_writelane_b32 v41, s31, 1
+; SI-NEXT:    v_writelane_b32 v41, s34, 2
+; SI-NEXT:    v_writelane_b32 v41, s35, 3
+; SI-NEXT:    v_writelane_b32 v41, s36, 4
+; SI-NEXT:    v_writelane_b32 v41, s37, 5
+; SI-NEXT:    v_writelane_b32 v41, s38, 6
+; SI-NEXT:    v_writelane_b32 v41, s39, 7
+; SI-NEXT:    v_writelane_b32 v41, s48, 8
+; SI-NEXT:    v_writelane_b32 v41, s49, 9
+; SI-NEXT:    v_writelane_b32 v41, s50, 10
+; SI-NEXT:    v_writelane_b32 v41, s51, 11
+; SI-NEXT:    v_writelane_b32 v41, s52, 12
+; SI-NEXT:    v_writelane_b32 v41, s53, 13
+; SI-NEXT:    v_writelane_b32 v41, s54, 14
+; SI-NEXT:    v_writelane_b32 v41, s55, 15
+; SI-NEXT:    v_writelane_b32 v41, s64, 16
+; SI-NEXT:    v_writelane_b32 v41, s65, 17
+; SI-NEXT:    v_writelane_b32 v41, s66, 18
+; SI-NEXT:    v_writelane_b32 v41, s67, 19
+; SI-NEXT:    v_writelane_b32 v41, s68, 20
+; SI-NEXT:    v_writelane_b32 v41, s69, 21
+; SI-NEXT:    v_writelane_b32 v41, s70, 22
+; SI-NEXT:    v_writelane_b32 v41, s71, 23
+; SI-NEXT:    v_writelane_b32 v41, s80, 24
+; SI-NEXT:    v_writelane_b32 v41, s81, 25
+; SI-NEXT:    v_writelane_b32 v41, s82, 26
+; SI-NEXT:    v_writelane_b32 v41, s83, 27
+; SI-NEXT:    v_writelane_b32 v41, s84, 28
+; SI-NEXT:    v_writelane_b32 v41, s85, 29
+; SI-NEXT:    v_writelane_b32 v41, s86, 30
+; SI-NEXT:    v_writelane_b32 v41, s87, 31
+; SI-NEXT:    v_writelane_b32 v41, s96, 32
+; SI-NEXT:    v_writelane_b32 v41, s97, 33
+; SI-NEXT:    v_writelane_b32 v41, s98, 34
+; SI-NEXT:    v_readfirstlane_b32 s39, v26
+; SI-NEXT:    ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
+; SI-NEXT:    v_readfirstlane_b32 s47, v12
+; SI-NEXT:    v_writelane_b32 v42, s39, 0
+; SI-NEXT:    v_readfirstlane_b32 s56, v11
+; SI-NEXT:    v_writelane_b32 v42, s47, 1
+; SI-NEXT:    v_readfirstlane_b32 s48, v24
+; SI-NEXT:    v_writelane_b32 v42, s56, 2
+; SI-NEXT:    v_readfirstlane_b32 s49, v23
+; SI-NEXT:    v_writelane_b32 v42, s48, 3
+; SI-NEXT:    v_readfirstlane_b32 s50, v21
+; SI-NEXT:    v_writelane_b32 v42, s49, 4
+; SI-NEXT:    v_readfirstlane_b32 s51, v22
+; SI-NEXT:    v_writelane_b32 v42, s50, 5
+; SI-NEXT:    v_writelane_b32 v42, s51, 6
+; SI-NEXT:    v_readfirstlane_b32 s57, v20
+; SI-NEXT:    v_readfirstlane_b32 s58, v19
+; SI-NEXT:    v_readfirstlane_b32 s64, v29
+; SI-NEXT:    v_readfirstlane_b32 s65, v30
+; SI-NEXT:    v_readfirstlane_b32 s59, v28
+; SI-NEXT:    v_readfirstlane_b32 s60, v27
+; SI-NEXT:    v_readfirstlane_b32 s11, v1
+; SI-NEXT:    v_readfirstlane_b32 s12, v2
+; SI-NEXT:    v_readfirstlane_b32 s13, v9
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_readfirstlane_b32 s4, v31
+; SI-NEXT:    v_writelane_b32 v43, s4, 14
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:300
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:296
+; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:292
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:288
+; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:284
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:280
+; SI-NEXT:    s_waitcnt vmcnt(12)
+; SI-NEXT:    v_readfirstlane_b32 s4, v32
+; SI-NEXT:    v_writelane_b32 v43, s4, 15
+; SI-NEXT:    s_waitcnt vmcnt(11)
+; SI-NEXT:    v_readfirstlane_b32 s4, v33
+; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:276
+; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:272
+; SI-NEXT:    v_writelane_b32 v43, s4, 16
+; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v34
-; SI-NEXT:    v_writelane_b32 v62, s4, 8
-; SI-NEXT:    v_readfirstlane_b32 s4, v38
-; SI-NEXT:    v_writelane_b32 v62, s4, 9
+; SI-NEXT:    v_writelane_b32 v43, s4, 17
+; SI-NEXT:    s_waitcnt vmcnt(11)
+; SI-NEXT:    v_readfirstlane_b32 s4, v35
+; SI-NEXT:    v_writelane_b32 v43, s4, 18
+; SI-NEXT:    s_waitcnt vmcnt(10)
+; SI-NEXT:    v_readfirstlane_b32 s44, v36
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_readfirstlane_b32 s90, v37
+; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:268
+; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:264
+; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:260
+; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:256
+; SI-NEXT:    s_waitcnt vmcnt(12)
+; SI-NEXT:    v_readfirstlane_b32 s6, v38
+; SI-NEXT:    v_readfirstlane_b32 s14, v10
+; SI-NEXT:    v_readfirstlane_b32 s15, v8
+; SI-NEXT:    v_readfirstlane_b32 s18, v7
+; SI-NEXT:    v_readfirstlane_b32 s21, v5
+; SI-NEXT:    v_readfirstlane_b32 s22, v6
+; SI-NEXT:    v_readfirstlane_b32 s40, v17
+; SI-NEXT:    v_readfirstlane_b32 s41, v18
+; SI-NEXT:    v_readfirstlane_b32 s42, v4
+; SI-NEXT:    v_readfirstlane_b32 s43, v3
+; SI-NEXT:    v_readfirstlane_b32 s76, v16
+; SI-NEXT:    v_readfirstlane_b32 s77, v15
+; SI-NEXT:    v_readfirstlane_b32 s38, v25
+; SI-NEXT:    v_writelane_b32 v41, s99, 35
+; SI-NEXT:    s_waitcnt vmcnt(11)
+; SI-NEXT:    v_readfirstlane_b32 s4, v31
+; SI-NEXT:    v_writelane_b32 v43, s4, 19
+; SI-NEXT:    s_waitcnt vmcnt(10)
+; SI-NEXT:    v_readfirstlane_b32 s4, v39
+; SI-NEXT:    v_writelane_b32 v43, s4, 20
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_readfirstlane_b32 s4, v48
+; SI-NEXT:    v_writelane_b32 v43, s4, 21
+; SI-NEXT:    s_waitcnt vmcnt(8)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v49
-; SI-NEXT:    v_writelane_b32 v62, s4, 10
+; SI-NEXT:    v_writelane_b32 v43, s4, 22
+; SI-NEXT:    s_waitcnt vmcnt(7)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v50
-; SI-NEXT:    v_writelane_b32 v62, s4, 11
-; SI-NEXT:    v_readfirstlane_b32 s79, v52
-; SI-NEXT:    v_readfirstlane_b32 s88, v54
-; SI-NEXT:    v_readfirstlane_b32 s4, v55
-; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:192
-; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:180
-; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:176
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:172
-; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:168
-; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:164
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:160
-; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:148
-; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:144
-; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:140
-; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:136
-; SI-NEXT:    v_writelane_b32 v62, s4, 12
-; SI-NEXT:    v_readfirstlane_b32 s77, v41
-; SI-NEXT:    v_readfirstlane_b32 s4, v42
-; SI-NEXT:    v_readfirstlane_b32 s94, v31
-; SI-NEXT:    v_readfirstlane_b32 s70, v32
-; SI-NEXT:    v_readfirstlane_b32 s51, v33
-; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_readfirstlane_b32 s37, v45
-; SI-NEXT:    v_readfirstlane_b32 s24, v56
-; SI-NEXT:    v_readfirstlane_b32 s7, v57
-; SI-NEXT:    v_readfirstlane_b32 s92, v58
-; SI-NEXT:    v_readfirstlane_b32 s28, v59
-; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:132
-; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:128
-; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:116
-; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:112
-; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:108
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:104
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:100
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:96
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:84
-; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:80
-; SI-NEXT:    v_readfirstlane_b32 s68, v35
-; SI-NEXT:    v_readfirstlane_b32 s21, v36
-; SI-NEXT:    v_readfirstlane_b32 s87, v37
-; SI-NEXT:    v_readfirstlane_b32 s67, v39
-; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:76
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:68
-; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:64
-; SI-NEXT:    v_readfirstlane_b32 s35, v43
-; SI-NEXT:    v_readfirstlane_b32 s55, v46
-; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_readfirstlane_b32 s74, v53
-; SI-NEXT:    v_readfirstlane_b32 s85, v48
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:52
-; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:44
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:40
-; SI-NEXT:    v_writelane_b32 v62, s4, 13
-; SI-NEXT:    v_readfirstlane_b32 s98, v40
-; SI-NEXT:    v_readfirstlane_b32 s69, v51
-; SI-NEXT:    v_readfirstlane_b32 s34, v27
-; SI-NEXT:    v_writelane_b32 v63, s99, 35
-; SI-NEXT:    v_readfirstlane_b32 s97, v26
-; SI-NEXT:    v_readfirstlane_b32 s80, v18
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 24, v14
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 24, v22
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 24, v30
-; SI-NEXT:    v_readfirstlane_b32 s96, v17
-; SI-NEXT:    v_readfirstlane_b32 s64, v9
-; SI-NEXT:    v_readfirstlane_b32 s25, v8
-; SI-NEXT:    v_readfirstlane_b32 s83, v7
-; SI-NEXT:    v_readfirstlane_b32 s84, v4
-; SI-NEXT:    v_readfirstlane_b32 s93, v3
-; SI-NEXT:    v_readfirstlane_b32 s76, v1
-; SI-NEXT:    v_readfirstlane_b32 s58, v38
-; SI-NEXT:    v_readfirstlane_b32 s65, v49
-; SI-NEXT:    v_readfirstlane_b32 s62, v54
-; SI-NEXT:    v_readfirstlane_b32 s81, v44
-; SI-NEXT:    v_readfirstlane_b32 s71, v47
-; SI-NEXT:    v_readfirstlane_b32 s38, v60
-; SI-NEXT:    v_readfirstlane_b32 s86, v61
-; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:156
-; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:220
-; SI-NEXT:    v_readfirstlane_b32 s90, v50
-; SI-NEXT:    v_readfirstlane_b32 s31, v52
-; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_readfirstlane_b32 s4, v55
-; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:36
-; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:20
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:12
-; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:336
-; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:4
-; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32
-; SI-NEXT:    v_readfirstlane_b32 s72, v31
-; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:316
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:72
-; SI-NEXT:    v_readfirstlane_b32 s82, v56
-; SI-NEXT:    v_readfirstlane_b32 s95, v57
+; SI-NEXT:    v_writelane_b32 v43, s4, 23
+; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    v_readfirstlane_b32 s4, v51
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:252
+; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:248
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:244
+; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:240
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:236
+; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:232
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:228
+; SI-NEXT:    s_waitcnt vmcnt(12)
+; SI-NEXT:    v_readfirstlane_b32 s91, v32
+; SI-NEXT:    s_waitcnt vmcnt(11)
+; SI-NEXT:    v_readfirstlane_b32 s8, v33
+; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:224
+; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:220
+; SI-NEXT:    v_writelane_b32 v43, s4, 24
+; SI-NEXT:    s_waitcnt vmcnt(12)
+; SI-NEXT:    v_readfirstlane_b32 s4, v34
+; SI-NEXT:    v_writelane_b32 v43, s4, 25
+; SI-NEXT:    s_waitcnt vmcnt(11)
+; SI-NEXT:    v_readfirstlane_b32 s4, v35
+; SI-NEXT:    v_writelane_b32 v43, s4, 26
+; SI-NEXT:    s_waitcnt vmcnt(10)
+; SI-NEXT:    v_readfirstlane_b32 s4, v36
+; SI-NEXT:    v_writelane_b32 v43, s4, 27
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_readfirstlane_b32 s4, v37
+; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:216
+; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:212
+; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:208
+; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:204
+; SI-NEXT:    v_writelane_b32 v43, s4, 28
+; SI-NEXT:    s_waitcnt vmcnt(12)
+; SI-NEXT:    v_readfirstlane_b32 s4, v31
+; SI-NEXT:    v_writelane_b32 v43, s4, 29
+; SI-NEXT:    s_waitcnt vmcnt(11)
+; SI-NEXT:    v_readfirstlane_b32 s89, v38
+; SI-NEXT:    s_waitcnt vmcnt(10)
+; SI-NEXT:    v_readfirstlane_b32 s78, v39
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_readfirstlane_b32 s7, v48
+; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_readfirstlane_b32 s82, v49
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_readfirstlane_b32 s4, v50
+; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    v_readfirstlane_b32 s96, v51
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:200
+; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:196
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:192
+; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:188
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:184
+; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:180
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:176
+; SI-NEXT:    s_waitcnt vmcnt(11)
+; SI-NEXT:    v_readfirstlane_b32 s70, v33
+; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:172
+; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:168
+; SI-NEXT:    v_writelane_b32 v43, s4, 30
+; SI-NEXT:    v_readfirstlane_b32 s4, v32
+; SI-NEXT:    v_writelane_b32 v43, s4, 31
+; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:164
+; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:160
+; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:156
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:152
 ; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_readfirstlane_b32 s39, v58
-; SI-NEXT:    v_readfirstlane_b32 s56, v59
-; SI-NEXT:    v_readfirstlane_b32 s57, v41
-; SI-NEXT:    v_readfirstlane_b32 s36, v42
-; SI-NEXT:    v_readfirstlane_b32 s73, v45
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:284
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:252
-; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:188
-; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:124
-; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:28
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:60
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:92
-; SI-NEXT:    v_readfirstlane_b32 s16, v34
-; SI-NEXT:    v_readfirstlane_b32 s48, v32
-; SI-NEXT:    v_readfirstlane_b32 s52, v33
-; SI-NEXT:    v_readfirstlane_b32 s47, v35
-; SI-NEXT:    v_readfirstlane_b32 s60, v37
-; SI-NEXT:    v_readfirstlane_b32 s61, v39
-; SI-NEXT:    v_writelane_b32 v62, s4, 14
-; SI-NEXT:    v_readfirstlane_b32 s54, v48
-; SI-NEXT:    v_readfirstlane_b32 s89, v43
-; SI-NEXT:    v_readfirstlane_b32 s50, v53
+; SI-NEXT:    v_readfirstlane_b32 s4, v34
+; SI-NEXT:    v_writelane_b32 v43, s4, 32
+; SI-NEXT:    v_readfirstlane_b32 s9, v35
+; SI-NEXT:    s_waitcnt vmcnt(13)
+; SI-NEXT:    v_readfirstlane_b32 s4, v37
+; SI-NEXT:    v_writelane_b32 v43, s4, 33
+; SI-NEXT:    v_readfirstlane_b32 s10, v36
+; SI-NEXT:    s_waitcnt vmcnt(12)
+; SI-NEXT:    v_readfirstlane_b32 s4, v31
+; SI-NEXT:    v_writelane_b32 v43, s4, 34
+; SI-NEXT:    s_waitcnt vmcnt(11)
+; SI-NEXT:    v_readfirstlane_b32 s4, v38
+; SI-NEXT:    v_writelane_b32 v43, s4, 35
+; SI-NEXT:    s_waitcnt vmcnt(10)
+; SI-NEXT:    v_readfirstlane_b32 s4, v39
+; SI-NEXT:    v_writelane_b32 v43, s4, 36
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_readfirstlane_b32 s69, v48
+; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_readfirstlane_b32 s30, v49
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_readfirstlane_b32 s16, v50
+; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    v_readfirstlane_b32 s36, v51
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:148
+; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:144
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_readfirstlane_b32 s4, v33
+; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:140
+; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:136
+; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:132
+; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:128
+; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:124
+; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:120
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:116
+; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:336
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:112
+; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:108
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:104
+; SI-NEXT:    v_writelane_b32 v43, s4, 37
 ; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_readfirstlane_b32 s99, v46
-; SI-NEXT:    v_readfirstlane_b32 s78, v49
-; SI-NEXT:    v_readfirstlane_b32 s30, v51
-; SI-NEXT:    v_readfirstlane_b32 s66, v54
-; SI-NEXT:    v_readfirstlane_b32 s91, v40
+; SI-NEXT:    v_readfirstlane_b32 s4, v52
+; SI-NEXT:    v_writelane_b32 v43, s4, 38
+; SI-NEXT:    v_readfirstlane_b32 s4, v53
+; SI-NEXT:    v_writelane_b32 v43, s4, 39
+; SI-NEXT:    v_readfirstlane_b32 s4, v54
+; SI-NEXT:    v_writelane_b32 v43, s4, 40
+; SI-NEXT:    v_writelane_b32 v43, s44, 41
+; SI-NEXT:    v_writelane_b32 v43, s6, 42
+; SI-NEXT:    v_writelane_b32 v43, s7, 43
+; SI-NEXT:    v_writelane_b32 v43, s8, 44
+; SI-NEXT:    v_writelane_b32 v43, s9, 45
+; SI-NEXT:    v_writelane_b32 v43, s10, 46
+; SI-NEXT:    v_writelane_b32 v43, s11, 47
+; SI-NEXT:    v_writelane_b32 v43, s12, 48
+; SI-NEXT:    v_writelane_b32 v43, s13, 49
+; SI-NEXT:    v_writelane_b32 v43, s14, 50
+; SI-NEXT:    v_writelane_b32 v43, s15, 51
+; SI-NEXT:    v_writelane_b32 v43, s18, 52
+; SI-NEXT:    v_writelane_b32 v43, s21, 53
+; SI-NEXT:    v_writelane_b32 v43, s22, 54
+; SI-NEXT:    v_writelane_b32 v43, s40, 55
+; SI-NEXT:    v_writelane_b32 v43, s41, 56
+; SI-NEXT:    v_writelane_b32 v43, s42, 57
+; SI-NEXT:    v_writelane_b32 v43, s43, 58
+; SI-NEXT:    v_writelane_b32 v43, s76, 59
+; SI-NEXT:    v_writelane_b32 v43, s77, 60
+; SI-NEXT:    v_readfirstlane_b32 s93, v55
 ; SI-NEXT:    s_waitcnt vmcnt(13)
-; SI-NEXT:    v_readfirstlane_b32 s6, v44
+; SI-NEXT:    v_readfirstlane_b32 s95, v40
+; SI-NEXT:    s_waitcnt vmcnt(10)
+; SI-NEXT:    v_readfirstlane_b32 s17, v33
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_readfirstlane_b32 s98, v34
+; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_readfirstlane_b32 s23, v35
+; SI-NEXT:    v_readfirstlane_b32 s25, v31
+; SI-NEXT:    v_readfirstlane_b32 s28, v32
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_readfirstlane_b32 s26, v36
+; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    v_readfirstlane_b32 s88, v37
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_readfirstlane_b32 s79, v38
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_readfirstlane_b32 s75, v39
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:100
+; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:96
+; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:92
+; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:88
+; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:84
+; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:80
+; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:76
+; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:72
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:68
 ; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v50
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v48
+; SI-NEXT:    s_waitcnt vmcnt(11)
+; SI-NEXT:    v_readfirstlane_b32 s24, v49
+; SI-NEXT:    s_waitcnt vmcnt(10)
+; SI-NEXT:    v_readfirstlane_b32 s85, v50
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_readfirstlane_b32 s66, v51
+; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:64
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:60
+; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:56
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:52
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_readfirstlane_b32 s4, v10
-; SI-NEXT:    v_writelane_b32 v62, s4, 15
-; SI-NEXT:    v_readfirstlane_b32 s4, v2
-; SI-NEXT:    v_writelane_b32 v62, s4, 16
-; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_readfirstlane_b32 s22, v36
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:312
-; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:280
-; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:248
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:216
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:184
-; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:152
-; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:120
-; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:88
-; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:56
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24
-; SI-NEXT:    v_writelane_b32 v62, s17, 17
-; SI-NEXT:    v_writelane_b32 v62, s15, 18
-; SI-NEXT:    v_writelane_b32 v62, s18, 19
-; SI-NEXT:    v_writelane_b32 v62, s43, 20
-; SI-NEXT:    v_writelane_b32 v62, s42, 21
-; SI-NEXT:    v_writelane_b32 v62, s44, 22
-; SI-NEXT:    v_writelane_b32 v62, s16, 23
-; SI-NEXT:    v_writelane_b32 v62, s49, 24
-; SI-NEXT:    v_writelane_b32 v62, s8, 25
-; SI-NEXT:    v_writelane_b32 v62, s6, 26
-; SI-NEXT:    v_readfirstlane_b32 s45, v52
-; SI-NEXT:    v_writelane_b32 v62, s56, 27
-; SI-NEXT:    v_writelane_b32 v62, s45, 28
-; SI-NEXT:    v_writelane_b32 v62, s53, 29
-; SI-NEXT:    v_writelane_b32 v62, s94, 30
-; SI-NEXT:    v_writelane_b32 v62, s57, 31
-; SI-NEXT:    v_writelane_b32 v62, s58, 32
-; SI-NEXT:    v_writelane_b32 v62, s47, 33
-; SI-NEXT:    v_readfirstlane_b32 s46, v55
-; SI-NEXT:    v_writelane_b32 v62, s40, 34
-; SI-NEXT:    v_readfirstlane_b32 s59, v47
-; SI-NEXT:    v_writelane_b32 v62, s46, 35
-; SI-NEXT:    v_writelane_b32 v62, s59, 36
-; SI-NEXT:    v_writelane_b32 v62, s60, 37
-; SI-NEXT:    v_writelane_b32 v62, s36, 38
-; SI-NEXT:    v_writelane_b32 v62, s65, 39
-; SI-NEXT:    v_writelane_b32 v62, s61, 40
-; SI-NEXT:    v_writelane_b32 v62, s73, 41
-; SI-NEXT:    v_writelane_b32 v62, s62, 42
-; SI-NEXT:    v_writelane_b32 v62, s72, 43
-; SI-NEXT:    v_writelane_b32 v62, s23, 44
-; SI-NEXT:    v_writelane_b32 v62, s48, 45
-; SI-NEXT:    v_writelane_b32 v62, s34, 46
-; SI-NEXT:    v_writelane_b32 v62, s78, 47
-; SI-NEXT:    v_writelane_b32 v62, s30, 48
-; SI-NEXT:    v_writelane_b32 v62, s54, 49
-; SI-NEXT:    v_writelane_b32 v62, s50, 50
-; SI-NEXT:    v_writelane_b32 v62, s52, 51
-; SI-NEXT:    v_writelane_b32 v62, s82, 52
-; SI-NEXT:    v_writelane_b32 v62, s66, 53
+; SI-NEXT:    v_readfirstlane_b32 vcc_lo, v13
+; SI-NEXT:    v_readfirstlane_b32 vcc_hi, v14
+; SI-NEXT:    v_writelane_b32 v43, vcc_lo, 61
+; SI-NEXT:    v_writelane_b32 v43, vcc_hi, 62
+; SI-NEXT:    v_writelane_b32 v43, s38, 63
 ; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 24, v57
+; SI-NEXT:    v_readfirstlane_b32 s20, v31
 ; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 24, v58
+; SI-NEXT:    v_readfirstlane_b32 s19, v32
 ; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_lshlrev_b32_e32 v23, 24, v59
-; SI-NEXT:    v_lshlrev_b32_e32 v24, 24, v56
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 24, v60
-; SI-NEXT:    v_lshlrev_b32_e32 v26, 24, v45
-; SI-NEXT:    v_lshlrev_b32_e32 v27, 24, v61
-; SI-NEXT:    v_lshlrev_b32_e32 v46, 24, v42
-; SI-NEXT:    v_lshlrev_b32_e32 v30, 24, v41
-; SI-NEXT:    v_lshlrev_b32_e32 v44, 24, v31
-; SI-NEXT:    v_writelane_b32 v62, s91, 54
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT:    v_readfirstlane_b32 s27, v33
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_readfirstlane_b32 s94, v34
+; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_readfirstlane_b32 s72, v35
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_readfirstlane_b32 s73, v36
+; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    v_readfirstlane_b32 s67, v37
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_readfirstlane_b32 s71, v38
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_readfirstlane_b32 s97, v39
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:48
+; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:44
+; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:40
+; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:36
+; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:32
+; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:28
+; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:20
 ; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; SI-NEXT:    s_cbranch_scc0 .LBB97_4
-; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
-; SI-NEXT:    v_readlane_b32 s5, v62, 5
-; SI-NEXT:    s_and_b32 s4, s20, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s5, 8
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_writelane_b32 v62, s4, 55
-; SI-NEXT:    v_readlane_b32 s4, v62, 4
-; SI-NEXT:    s_and_b32 s4, s4, 0xff
-; SI-NEXT:    v_readlane_b32 s5, v62, 3
-; SI-NEXT:    s_lshl_b32 s4, s4, 16
-; SI-NEXT:    s_lshl_b32 s5, s5, 24
-; SI-NEXT:    s_or_b32 s63, s5, s4
-; SI-NEXT:    v_readlane_b32 s4, v62, 6
-; SI-NEXT:    s_and_b32 s5, s4, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s9, s19, 24
-; SI-NEXT:    v_readlane_b32 s4, v62, 0
-; SI-NEXT:    s_or_b32 s9, s9, s5
-; SI-NEXT:    s_and_b32 s5, s4, 0xff
-; SI-NEXT:    s_lshl_b32 s10, s29, 8
-; SI-NEXT:    s_or_b32 s4, s5, s10
-; SI-NEXT:    v_writelane_b32 v62, s4, 56
-; SI-NEXT:    s_and_b32 s5, s76, 0xff
-; SI-NEXT:    v_readlane_b32 s10, v62, 16
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s11, s10, 24
-; SI-NEXT:    s_or_b32 s5, s11, s5
-; SI-NEXT:    s_and_b32 s11, s26, 0xff
-; SI-NEXT:    s_lshl_b32 s11, s11, 16
-; SI-NEXT:    s_lshl_b32 s12, s27, 24
-; SI-NEXT:    s_or_b32 s14, s12, s11
-; SI-NEXT:    s_and_b32 s11, s83, 0xff
-; SI-NEXT:    s_lshl_b32 s12, s25, 8
-; SI-NEXT:    s_or_b32 s10, s11, s12
-; SI-NEXT:    v_writelane_b32 v62, s10, 57
-; SI-NEXT:    s_and_b32 s11, s64, 0xff
-; SI-NEXT:    v_readlane_b32 s10, v62, 15
-; SI-NEXT:    s_lshl_b32 s11, s11, 16
-; SI-NEXT:    s_lshl_b32 s13, s10, 24
-; SI-NEXT:    s_or_b32 s41, s13, s11
-; SI-NEXT:    s_and_b32 s11, s43, 0xff
-; SI-NEXT:    s_lshl_b32 s13, s15, 8
-; SI-NEXT:    s_or_b32 s10, s11, s13
-; SI-NEXT:    s_and_b32 s11, s96, 0xff
-; SI-NEXT:    s_lshl_b32 s11, s11, 16
-; SI-NEXT:    s_lshl_b32 s15, s80, 24
-; SI-NEXT:    s_or_b32 s43, s15, s11
-; SI-NEXT:    s_and_b32 s11, s44, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s42, 8
-; SI-NEXT:    s_or_b32 s13, s11, s15
-; SI-NEXT:    s_and_b32 s11, s18, 0xff
-; SI-NEXT:    s_lshl_b32 s11, s11, 16
-; SI-NEXT:    s_lshl_b32 s15, s97, 24
-; SI-NEXT:    s_or_b32 s44, s15, s11
-; SI-NEXT:    s_and_b32 s11, s59, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s46, 8
-; SI-NEXT:    s_or_b32 s12, s11, s15
-; SI-NEXT:    s_and_b32 s11, s45, 0xff
-; SI-NEXT:    s_lshl_b32 s11, s11, 16
-; SI-NEXT:    s_lshl_b32 s15, s6, 24
-; SI-NEXT:    s_or_b32 s45, s15, s11
-; SI-NEXT:    s_and_b32 s11, s30, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s78, 8
-; SI-NEXT:    v_writelane_b32 v62, s10, 58
-; SI-NEXT:    s_or_b32 s10, s11, s15
-; SI-NEXT:    s_and_b32 s11, s99, 0xff
-; SI-NEXT:    s_lshl_b32 s11, s11, 16
-; SI-NEXT:    s_lshl_b32 s15, s89, 24
-; SI-NEXT:    s_or_b32 s46, s15, s11
-; SI-NEXT:    s_and_b32 s11, s61, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s60, 8
-; SI-NEXT:    s_or_b32 s6, s11, s15
-; SI-NEXT:    s_and_b32 s11, s22, 0xff
-; SI-NEXT:    s_lshl_b32 s11, s11, 16
-; SI-NEXT:    s_lshl_b32 s15, s47, 24
-; SI-NEXT:    s_or_b32 s47, s15, s11
-; SI-NEXT:    s_and_b32 s11, s57, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s56, 8
-; SI-NEXT:    v_writelane_b32 v62, s6, 59
-; SI-NEXT:    s_or_b32 s6, s11, s15
-; SI-NEXT:    s_and_b32 s11, s39, 0xff
-; SI-NEXT:    v_writelane_b32 v62, s6, 60
-; SI-NEXT:    s_lshl_b32 s11, s11, 16
-; SI-NEXT:    s_lshl_b32 s15, s95, 24
-; SI-NEXT:    s_or_b32 s56, s15, s11
-; SI-NEXT:    s_and_b32 s11, s48, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s72, 8
-; SI-NEXT:    v_readlane_b32 s6, v62, 14
-; SI-NEXT:    s_or_b32 s48, s11, s15
-; SI-NEXT:    s_and_b32 s11, s6, 0xff
-; SI-NEXT:    s_lshl_b32 s11, s11, 16
-; SI-NEXT:    s_lshl_b32 s15, s31, 24
-; SI-NEXT:    s_or_b32 vcc_lo, s15, s11
-; SI-NEXT:    s_and_b32 s11, s86, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s38, 8
-; SI-NEXT:    s_or_b32 s72, s11, s15
-; SI-NEXT:    s_and_b32 s11, s71, 0xff
-; SI-NEXT:    s_lshl_b32 s11, s11, 16
-; SI-NEXT:    s_lshl_b32 s15, s81, 24
-; SI-NEXT:    s_or_b32 vcc_hi, s15, s11
-; SI-NEXT:    s_and_b32 s11, s58, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s85, 8
-; SI-NEXT:    s_or_b32 s57, s11, s15
-; SI-NEXT:    s_and_b32 s11, s69, 0xff
-; SI-NEXT:    s_lshl_b32 s11, s11, 16
-; SI-NEXT:    s_lshl_b32 s15, s74, 24
-; SI-NEXT:    v_writelane_b32 v62, s74, 61
-; SI-NEXT:    s_or_b32 s74, s15, s11
-; SI-NEXT:    s_and_b32 s11, s87, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s21, 8
-; SI-NEXT:    s_or_b32 s58, s11, s15
-; SI-NEXT:    s_and_b32 s11, s68, 0xff
-; SI-NEXT:    s_lshl_b32 s11, s11, 16
-; SI-NEXT:    s_lshl_b32 s15, s28, 24
-; SI-NEXT:    s_or_b32 s75, s15, s11
-; SI-NEXT:    s_and_b32 s11, s24, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s55, 8
-; SI-NEXT:    v_writelane_b32 v62, s25, 62
-; SI-NEXT:    s_or_b32 s59, s11, s15
-; SI-NEXT:    s_and_b32 s11, s37, 0xff
-; SI-NEXT:    s_lshl_b32 s11, s11, 16
-; SI-NEXT:    s_lshl_b32 s15, s51, 24
-; SI-NEXT:    v_readlane_b32 s4, v62, 13
-; SI-NEXT:    s_mov_b32 s18, s21
-; SI-NEXT:    s_mov_b32 s21, s97
-; SI-NEXT:    s_mov_b32 s97, s37
-; SI-NEXT:    s_mov_b32 s37, s76
-; SI-NEXT:    s_or_b32 s76, s15, s11
-; SI-NEXT:    s_and_b32 s11, s35, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s4, 8
-; SI-NEXT:    s_or_b32 s60, s11, s15
-; SI-NEXT:    s_and_b32 s11, s77, 0xff
-; SI-NEXT:    v_readlane_b32 s4, v62, 12
-; SI-NEXT:    s_lshl_b32 s11, s11, 16
-; SI-NEXT:    s_lshl_b32 s15, s4, 24
-; SI-NEXT:    v_readlane_b32 s4, v62, 11
-; SI-NEXT:    s_mov_b32 s6, s95
-; SI-NEXT:    s_mov_b32 s95, s39
-; SI-NEXT:    s_mov_b32 s39, s89
-; SI-NEXT:    s_mov_b32 s89, s99
-; SI-NEXT:    s_mov_b32 s99, s83
-; SI-NEXT:    s_mov_b32 s83, s55
-; SI-NEXT:    s_mov_b32 s55, s64
-; SI-NEXT:    s_mov_b32 s64, s35
-; SI-NEXT:    s_mov_b32 s35, s77
-; SI-NEXT:    s_or_b32 s77, s15, s11
-; SI-NEXT:    s_and_b32 s11, s4, 0xff
-; SI-NEXT:    v_readlane_b32 s4, v62, 10
-; SI-NEXT:    s_lshl_b32 s15, s4, 8
-; SI-NEXT:    v_readlane_b32 s4, v62, 9
-; SI-NEXT:    s_or_b32 s61, s11, s15
-; SI-NEXT:    s_and_b32 s11, s4, 0xff
-; SI-NEXT:    v_readlane_b32 s4, v62, 8
-; SI-NEXT:    s_lshl_b32 s11, s11, 16
-; SI-NEXT:    s_lshl_b32 s15, s4, 24
-; SI-NEXT:    s_or_b32 s78, s15, s11
-; SI-NEXT:    v_readlane_b32 s11, v62, 7
-; SI-NEXT:    s_and_b32 s11, s11, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s17, 8
-; SI-NEXT:    s_or_b32 s11, s11, s15
-; SI-NEXT:    s_and_b32 s11, s11, 0xffff
-; SI-NEXT:    v_mov_b32_e32 v51, s9
-; SI-NEXT:    s_or_b32 s17, s11, s9
-; SI-NEXT:    v_readlane_b32 s9, v62, 2
-; SI-NEXT:    v_readlane_b32 s11, v62, 1
-; SI-NEXT:    s_and_b32 s9, s9, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s11, 8
-; SI-NEXT:    s_or_b32 s9, s9, s15
-; SI-NEXT:    s_and_b32 s9, s9, 0xffff
-; SI-NEXT:    s_mov_b32 s4, s96
-; SI-NEXT:    s_mov_b32 s96, s24
-; SI-NEXT:    v_mov_b32_e32 v52, s14
-; SI-NEXT:    s_or_b32 s24, s9, s14
-; SI-NEXT:    s_and_b32 s14, s93, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s84, 8
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    s_or_b32 s14, s14, s15
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v53, v6, v1
-; SI-NEXT:    s_and_b32 s14, s14, 0xffff
-; SI-NEXT:    v_or_b32_e32 v47, s14, v53
-; SI-NEXT:    s_and_b32 s14, s8, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s49, 8
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v13
-; SI-NEXT:    s_or_b32 s14, s14, s15
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v54, v14, v1
-; SI-NEXT:    s_and_b32 s14, s14, 0xffff
-; SI-NEXT:    v_or_b32_e32 v45, s14, v54
-; SI-NEXT:    s_and_b32 s14, s40, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s53, 8
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v21
-; SI-NEXT:    s_or_b32 s14, s14, s15
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v55, v18, v1
-; SI-NEXT:    s_and_b32 s14, s14, 0xffff
-; SI-NEXT:    v_or_b32_e32 v11, s14, v55
-; SI-NEXT:    s_and_b32 s14, s34, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s23, 8
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v29
-; SI-NEXT:    s_or_b32 s14, s14, s15
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v40, v19, v1
-; SI-NEXT:    s_and_b32 s14, s14, 0xffff
-; SI-NEXT:    v_or_b32_e32 v50, s14, v40
-; SI-NEXT:    s_and_b32 s14, s91, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s66, 8
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v5
-; SI-NEXT:    s_or_b32 s14, s14, s15
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v41, v20, v1
-; SI-NEXT:    s_and_b32 s14, s14, 0xffff
-; SI-NEXT:    v_or_b32_e32 v17, s14, v41
-; SI-NEXT:    s_and_b32 s14, s50, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s54, 8
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v32
-; SI-NEXT:    s_or_b32 s14, s14, s15
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v42, v22, v1
-; SI-NEXT:    s_and_b32 s14, s14, 0xffff
-; SI-NEXT:    v_or_b32_e32 v16, s14, v42
-; SI-NEXT:    s_and_b32 s14, s73, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s36, 8
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v33
-; SI-NEXT:    s_or_b32 s14, s14, s15
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v59, v23, v1
-; SI-NEXT:    s_and_b32 s14, s14, 0xffff
-; SI-NEXT:    v_or_b32_e32 v15, s14, v59
-; SI-NEXT:    s_and_b32 s14, s82, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s52, 8
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v34
-; SI-NEXT:    s_or_b32 s14, s14, s15
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v60, v24, v1
-; SI-NEXT:    s_and_b32 s14, s14, 0xffff
-; SI-NEXT:    v_or_b32_e32 v12, s14, v60
-; SI-NEXT:    s_and_b32 s14, s90, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s16, 8
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v35
-; SI-NEXT:    s_or_b32 s14, s14, s15
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v2, v25, v1
-; SI-NEXT:    s_and_b32 s14, s14, 0xffff
-; SI-NEXT:    v_or_b32_e32 v10, s14, v2
-; SI-NEXT:    s_and_b32 s14, s62, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s65, 8
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v36
-; SI-NEXT:    s_or_b32 s14, s14, s15
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v61, v26, v1
-; SI-NEXT:    s_and_b32 s14, s14, 0xffff
-; SI-NEXT:    v_or_b32_e32 v8, s14, v61
-; SI-NEXT:    s_and_b32 s14, s98, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s67, 8
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v37
-; SI-NEXT:    s_or_b32 s14, s14, s15
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_or_b32_e32 v6, v27, v1
-; SI-NEXT:    s_and_b32 s14, s14, 0xffff
-; SI-NEXT:    v_or_b32_e32 v7, s14, v6
-; SI-NEXT:    s_and_b32 s14, s92, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s7, 8
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v38
-; SI-NEXT:    s_or_b32 s14, s14, s15
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_readlane_b32 s8, v62, 55
-; SI-NEXT:    v_mov_b32_e32 v28, v38
-; SI-NEXT:    v_mov_b32_e32 v38, v25
-; SI-NEXT:    v_mov_b32_e32 v25, v35
-; SI-NEXT:    v_mov_b32_e32 v35, v22
-; SI-NEXT:    v_mov_b32_e32 v22, v32
-; SI-NEXT:    v_mov_b32_e32 v32, v18
-; SI-NEXT:    v_mov_b32_e32 v18, v13
-; SI-NEXT:    v_or_b32_e32 v13, v46, v1
-; SI-NEXT:    s_and_b32 s14, s14, 0xffff
-; SI-NEXT:    s_and_b32 s8, s8, 0xffff
-; SI-NEXT:    v_or_b32_e32 v4, s14, v13
-; SI-NEXT:    s_and_b32 s14, s70, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s94, 8
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v39
-; SI-NEXT:    s_or_b32 s42, s8, s63
-; SI-NEXT:    v_readlane_b32 s8, v62, 56
-; SI-NEXT:    s_or_b32 s14, s14, s15
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    s_and_b32 s8, s8, 0xffff
-; SI-NEXT:    v_mov_b32_e32 v48, v27
-; SI-NEXT:    v_mov_b32_e32 v27, v37
-; SI-NEXT:    v_mov_b32_e32 v37, v24
-; SI-NEXT:    v_mov_b32_e32 v24, v34
-; SI-NEXT:    v_mov_b32_e32 v34, v20
-; SI-NEXT:    v_mov_b32_e32 v20, v19
-; SI-NEXT:    v_mov_b32_e32 v19, v29
-; SI-NEXT:    v_or_b32_e32 v29, v30, v1
-; SI-NEXT:    s_and_b32 s14, s14, 0xffff
-; SI-NEXT:    s_or_b32 s40, s8, s5
-; SI-NEXT:    v_readlane_b32 s8, v62, 57
-; SI-NEXT:    v_or_b32_e32 v31, s14, v29
-; SI-NEXT:    s_and_b32 s14, s88, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s79, 8
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v43
-; SI-NEXT:    s_and_b32 s8, s8, 0xffff
-; SI-NEXT:    v_readlane_b32 s9, v62, 60
-; SI-NEXT:    s_or_b32 s14, s14, s15
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    s_or_b32 s15, s8, s41
-; SI-NEXT:    v_readlane_b32 s8, v62, 58
-; SI-NEXT:    s_and_b32 s16, s9, 0xffff
-; SI-NEXT:    v_mov_b32_e32 v9, v30
-; SI-NEXT:    v_mov_b32_e32 v30, v39
-; SI-NEXT:    v_mov_b32_e32 v39, v26
-; SI-NEXT:    v_mov_b32_e32 v26, v36
-; SI-NEXT:    v_mov_b32_e32 v36, v23
-; SI-NEXT:    v_mov_b32_e32 v23, v33
-; SI-NEXT:    v_mov_b32_e32 v33, v5
-; SI-NEXT:    v_mov_b32_e32 v5, v14
-; SI-NEXT:    v_or_b32_e32 v14, v44, v3
-; SI-NEXT:    s_and_b32 s14, s14, 0xffff
-; SI-NEXT:    s_and_b32 s8, s8, 0xffff
-; SI-NEXT:    s_or_b32 s36, s16, s56
-; SI-NEXT:    s_and_b32 s16, s48, 0xffff
-; SI-NEXT:    v_or_b32_e32 v49, s14, v14
-; SI-NEXT:    s_or_b32 s14, s8, s43
-; SI-NEXT:    s_and_b32 s8, s13, 0xffff
-; SI-NEXT:    s_or_b32 s53, s16, vcc_lo
-; SI-NEXT:    s_and_b32 s16, s72, 0xffff
-; SI-NEXT:    s_or_b32 s13, s8, s44
-; SI-NEXT:    s_and_b32 s8, s12, 0xffff
-; SI-NEXT:    s_or_b32 s94, s16, vcc_hi
-; SI-NEXT:    s_and_b32 s16, s57, 0xffff
-; SI-NEXT:    s_or_b32 s12, s8, s45
-; SI-NEXT:    s_and_b32 s8, s10, 0xffff
-; SI-NEXT:    s_or_b32 s49, s16, s74
-; SI-NEXT:    s_and_b32 s16, s58, 0xffff
-; SI-NEXT:    s_or_b32 s10, s8, s46
-; SI-NEXT:    v_readlane_b32 s8, v62, 59
-; SI-NEXT:    s_or_b32 s48, s16, s75
-; SI-NEXT:    s_and_b32 s16, s59, 0xffff
-; SI-NEXT:    s_and_b32 s8, s8, 0xffff
-; SI-NEXT:    s_or_b32 s11, s16, s76
-; SI-NEXT:    s_and_b32 s16, s60, 0xffff
-; SI-NEXT:    s_and_b32 s23, s61, 0xffff
-; SI-NEXT:    s_mov_b32 s30, s87
-; SI-NEXT:    s_mov_b32 s87, s85
-; SI-NEXT:    s_or_b32 s8, s8, s47
-; SI-NEXT:    s_or_b32 s9, s16, s77
-; SI-NEXT:    s_or_b32 s16, s23, s78
-; SI-NEXT:    v_alignbit_b32 v57, s42, v51, 16
-; SI-NEXT:    v_alignbit_b32 v58, s40, v52, 16
-; SI-NEXT:    v_alignbit_b32 v56, s15, v53, 16
-; SI-NEXT:    v_alignbit_b32 v51, s14, v54, 16
-; SI-NEXT:    v_alignbit_b32 v1, s13, v55, 16
-; SI-NEXT:    v_alignbit_b32 v3, s12, v40, 16
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_alignbit_b32 v44, s10, v41, 16
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v43, s8, v42, 16
-; SI-NEXT:    v_alignbit_b32 v42, s36, v59, 16
-; SI-NEXT:    v_alignbit_b32 v41, s53, v60, 16
-; SI-NEXT:    v_alignbit_b32 v40, s94, v2, 16
-; SI-NEXT:    v_mov_b32_e32 v2, v4
-; SI-NEXT:    v_mov_b32_e32 v4, v7
-; SI-NEXT:    v_mov_b32_e32 v7, v8
-; SI-NEXT:    v_mov_b32_e32 v8, v10
-; SI-NEXT:    v_mov_b32_e32 v10, v15
-; SI-NEXT:    v_mov_b32_e32 v15, v17
-; SI-NEXT:    v_mov_b32_e32 v17, v11
-; SI-NEXT:    v_alignbit_b32 v55, s49, v61, 16
-; SI-NEXT:    v_alignbit_b32 v54, s48, v6, 16
-; SI-NEXT:    v_alignbit_b32 v53, s11, v13, 16
-; SI-NEXT:    v_mov_b32_e32 v13, v18
-; SI-NEXT:    v_mov_b32_e32 v18, v32
-; SI-NEXT:    v_mov_b32_e32 v32, v22
-; SI-NEXT:    v_mov_b32_e32 v22, v35
-; SI-NEXT:    v_mov_b32_e32 v35, v25
-; SI-NEXT:    v_mov_b32_e32 v25, v38
-; SI-NEXT:    v_mov_b32_e32 v38, v28
-; SI-NEXT:    v_alignbit_b32 v52, s9, v29, 16
-; SI-NEXT:    v_mov_b32_e32 v29, v19
-; SI-NEXT:    v_mov_b32_e32 v19, v20
-; SI-NEXT:    v_mov_b32_e32 v20, v34
-; SI-NEXT:    v_mov_b32_e32 v34, v24
-; SI-NEXT:    v_mov_b32_e32 v24, v37
-; SI-NEXT:    v_mov_b32_e32 v37, v27
-; SI-NEXT:    v_mov_b32_e32 v27, v48
-; SI-NEXT:    v_alignbit_b32 v28, s16, v14, 16
-; SI-NEXT:    v_mov_b32_e32 v14, v5
-; SI-NEXT:    v_mov_b32_e32 v5, v33
-; SI-NEXT:    v_mov_b32_e32 v33, v23
-; SI-NEXT:    v_mov_b32_e32 v23, v36
-; SI-NEXT:    v_mov_b32_e32 v36, v26
-; SI-NEXT:    v_mov_b32_e32 v26, v39
-; SI-NEXT:    v_mov_b32_e32 v39, v30
-; SI-NEXT:    v_mov_b32_e32 v30, v9
-; SI-NEXT:    v_mov_b32_e32 v9, v12
-; SI-NEXT:    v_mov_b32_e32 v12, v16
-; SI-NEXT:    v_mov_b32_e32 v16, v50
-; SI-NEXT:    s_lshr_b32 s73, s63, 16
-; SI-NEXT:    s_lshr_b32 s72, s5, 16
-; SI-NEXT:    s_lshr_b32 s63, s41, 16
-; SI-NEXT:    s_lshr_b32 s62, s43, 16
-; SI-NEXT:    s_lshr_b32 s61, s44, 16
-; SI-NEXT:    s_lshr_b32 s60, s45, 16
-; SI-NEXT:    s_lshr_b32 s59, s46, 16
-; SI-NEXT:    s_lshr_b32 s58, s47, 16
-; SI-NEXT:    s_lshr_b32 s57, s56, 16
-; SI-NEXT:    s_lshr_b32 s56, vcc_lo, 16
-; SI-NEXT:    s_lshr_b32 s47, vcc_hi, 16
-; SI-NEXT:    s_lshr_b32 s46, s74, 16
-; SI-NEXT:    v_readlane_b32 s25, v62, 62
-; SI-NEXT:    v_readlane_b32 s74, v62, 61
-; SI-NEXT:    s_lshr_b32 s45, s75, 16
-; SI-NEXT:    s_lshr_b32 s44, s76, 16
-; SI-NEXT:    s_mov_b32 s76, s37
-; SI-NEXT:    s_mov_b32 s37, s97
-; SI-NEXT:    s_mov_b32 s97, s21
-; SI-NEXT:    s_mov_b32 s21, s18
-; SI-NEXT:    s_mov_b32 s18, s17
-; SI-NEXT:    s_mov_b32 s85, s87
-; SI-NEXT:    s_mov_b32 s87, s30
-; SI-NEXT:    s_mov_b32 s17, s24
-; SI-NEXT:    s_lshr_b32 s43, s77, 16
-; SI-NEXT:    s_mov_b32 s77, s35
-; SI-NEXT:    s_mov_b32 s35, s64
-; SI-NEXT:    s_mov_b32 s64, s55
-; SI-NEXT:    s_mov_b32 s55, s83
-; SI-NEXT:    s_mov_b32 s83, s99
-; SI-NEXT:    s_mov_b32 s99, s89
-; SI-NEXT:    s_mov_b32 s89, s39
-; SI-NEXT:    s_mov_b32 s39, s95
-; SI-NEXT:    s_mov_b32 s95, s6
-; SI-NEXT:    s_lshr_b32 s41, s78, 16
-; SI-NEXT:    s_mov_b32 s24, s96
-; SI-NEXT:    s_mov_b32 s96, s4
-; SI-NEXT:    s_cbranch_execnz .LBB97_3
-; SI-NEXT:  .LBB97_2: ; %cmp.true
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
-; SI-NEXT:    s_add_i32 s4, s88, 3
-; SI-NEXT:    s_and_b32 s4, s4, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s79, 8
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_readlane_b32 s5, v62, 10
-; SI-NEXT:    v_readlane_b32 s6, v62, 9
-; SI-NEXT:    s_lshl_b32 s5, s5, 8
-; SI-NEXT:    s_add_i32 s8, s6, 3
-; SI-NEXT:    s_and_b32 s8, s8, 0xff
-; SI-NEXT:    s_lshl_b32 s8, s8, 16
-; SI-NEXT:    s_add_i32 s70, s70, 3
-; SI-NEXT:    v_readlane_b32 s6, v62, 30
-; SI-NEXT:    s_add_i32 s9, s77, 3
-; SI-NEXT:    s_and_b32 s9, s9, 0xff
-; SI-NEXT:    s_lshl_b32 s9, s9, 16
-; SI-NEXT:    s_add_i32 s79, s92, 3
-; SI-NEXT:    s_add_i32 s52, s98, 3
+; SI-NEXT:    v_readfirstlane_b32 s35, v48
+; SI-NEXT:    s_waitcnt vmcnt(10)
+; SI-NEXT:    v_readfirstlane_b32 s83, v49
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_readfirstlane_b32 s87, v50
+; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_readfirstlane_b32 s63, v51
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:16
+; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:4
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32
 ; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 3, v37
-; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT:    v_or_b32_e32 v4, v27, v4
-; SI-NEXT:    s_add_i32 s30, s87, 3
+; SI-NEXT:    v_readfirstlane_b32 s74, v31
 ; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 3, v36
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    v_or_b32_e32 v7, v26, v7
-; SI-NEXT:    s_add_i32 s50, s90, 3
+; SI-NEXT:    v_readfirstlane_b32 s81, v32
 ; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 3, v35
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; SI-NEXT:    v_or_b32_e32 v8, v25, v8
-; SI-NEXT:    s_add_i32 s94, s86, 3
+; SI-NEXT:    v_readfirstlane_b32 s80, v33
 ; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 3, v34
-; SI-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT:    v_or_b32_e32 v9, v24, v9
+; SI-NEXT:    v_readfirstlane_b32 s86, v34
 ; SI-NEXT:    s_waitcnt vmcnt(8)
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 3, v33
-; SI-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT:    v_or_b32_e32 v10, v23, v10
+; SI-NEXT:    v_readfirstlane_b32 s34, v35
 ; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 3, v32
-; SI-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT:    v_or_b32_e32 v11, v22, v11
+; SI-NEXT:    v_readfirstlane_b32 s84, v36
 ; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v5
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_or_b32_e32 v5, v20, v5
+; SI-NEXT:    v_readfirstlane_b32 s31, v37
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_readfirstlane_b32 s61, v38
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_readfirstlane_b32 s62, v39
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_readfirstlane_b32 s53, v48
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v1
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_or_b32_e32 v1, v2, v1
-; SI-NEXT:    v_or_b32_e32 v1, s4, v1
-; SI-NEXT:    v_readlane_b32 s4, v62, 11
-; SI-NEXT:    s_add_i32 s4, s4, 3
-; SI-NEXT:    s_and_b32 s4, s4, 0xff
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    v_readlane_b32 s5, v62, 8
-; SI-NEXT:    s_lshl_b32 s5, s5, 24
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_or_b32 s5, s5, s8
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_and_b32 s5, s70, 0xff
-; SI-NEXT:    s_lshl_b32 s8, s6, 8
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v39
-; SI-NEXT:    s_or_b32 s5, s8, s5
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    s_addk_i32 s5, 0x300
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    s_and_b32 s5, s5, 0xffff
-; SI-NEXT:    v_or_b32_e32 v2, v30, v2
-; SI-NEXT:    v_or_b32_e32 v2, s5, v2
-; SI-NEXT:    s_add_i32 s5, s35, 3
-; SI-NEXT:    v_readlane_b32 s6, v62, 13
-; SI-NEXT:    s_and_b32 s5, s5, 0xff
-; SI-NEXT:    s_lshl_b32 s8, s6, 8
-; SI-NEXT:    s_or_b32 s5, s8, s5
-; SI-NEXT:    v_readlane_b32 s6, v62, 12
-; SI-NEXT:    s_lshl_b32 s8, s6, 24
-; SI-NEXT:    s_addk_i32 s5, 0x300
-; SI-NEXT:    s_or_b32 s8, s8, s9
-; SI-NEXT:    s_and_b32 s5, s5, 0xffff
-; SI-NEXT:    s_or_b32 s5, s8, s5
-; SI-NEXT:    s_add_i32 s16, s4, 0x3000000
-; SI-NEXT:    v_add_i32_e32 v31, vcc, 0x3000000, v2
-; SI-NEXT:    s_add_i32 s9, s5, 0x3000000
-; SI-NEXT:    s_and_b32 s4, s79, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s7, 8
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v38
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_or_b32_e32 v2, v46, v2
-; SI-NEXT:    v_or_b32_e32 v2, s4, v2
-; SI-NEXT:    s_add_i32 s4, s24, 3
+; SI-NEXT:    v_readfirstlane_b32 s52, v49
+; SI-NEXT:    v_writelane_b32 v42, s52, 7
+; SI-NEXT:    v_writelane_b32 v42, s53, 8
+; SI-NEXT:    v_writelane_b32 v42, s57, 9
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_readfirstlane_b32 s54, v50
+; SI-NEXT:    v_writelane_b32 v42, s58, 10
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_readfirstlane_b32 s55, v51
+; SI-NEXT:    v_writelane_b32 v42, s54, 11
+; SI-NEXT:    v_writelane_b32 v42, s55, 12
+; SI-NEXT:    v_writelane_b32 v42, s64, 13
+; SI-NEXT:    v_writelane_b32 v42, s65, 14
+; SI-NEXT:    v_writelane_b32 v42, s67, 15
+; SI-NEXT:    v_writelane_b32 v42, s71, 16
+; SI-NEXT:    v_writelane_b32 v42, s80, 17
+; SI-NEXT:    v_writelane_b32 v42, s81, 18
+; SI-NEXT:    v_writelane_b32 v42, s59, 19
+; SI-NEXT:    v_writelane_b32 v42, s60, 20
+; SI-NEXT:    v_writelane_b32 v42, s86, 21
+; SI-NEXT:    v_writelane_b32 v42, s97, 22
+; SI-NEXT:    v_writelane_b32 v42, s34, 23
+; SI-NEXT:    v_writelane_b32 v42, s66, 24
+; SI-NEXT:    v_writelane_b32 v42, s85, 25
+; SI-NEXT:    v_writelane_b32 v42, s31, 26
+; SI-NEXT:    v_writelane_b32 v42, s84, 27
+; SI-NEXT:    v_writelane_b32 v42, s35, 28
+; SI-NEXT:    v_writelane_b32 v42, s98, 29
+; SI-NEXT:    v_writelane_b32 v42, s17, 30
+; SI-NEXT:    v_writelane_b32 v42, s20, 31
+; SI-NEXT:    v_writelane_b32 v42, s61, 32
+; SI-NEXT:    v_writelane_b32 v42, s19, 33
+; SI-NEXT:    v_writelane_b32 v42, s62, 34
+; SI-NEXT:    v_writelane_b32 v42, s23, 35
+; SI-NEXT:    v_writelane_b32 v42, s83, 36
+; SI-NEXT:    v_writelane_b32 v42, s87, 37
+; SI-NEXT:    v_writelane_b32 v42, s26, 38
+; SI-NEXT:    v_writelane_b32 v42, s94, 39
+; SI-NEXT:    v_writelane_b32 v42, s27, 40
+; SI-NEXT:    v_writelane_b32 v42, s63, 41
+; SI-NEXT:    v_writelane_b32 v42, s79, 42
+; SI-NEXT:    v_writelane_b32 v42, s88, 43
+; SI-NEXT:    v_writelane_b32 v42, s72, 44
+; SI-NEXT:    v_writelane_b32 v42, s73, 45
+; SI-NEXT:    v_writelane_b32 v42, s74, 46
+; SI-NEXT:    v_writelane_b32 v42, s75, 47
+; SI-NEXT:    v_writelane_b32 v42, s24, 48
+; SI-NEXT:    v_writelane_b32 v42, s25, 49
+; SI-NEXT:    v_writelane_b32 v42, s28, 50
+; SI-NEXT:    s_cbranch_scc0 .LBB97_4
+; SI-NEXT:  ; %bb.1: ; %cmp.false
+; SI-NEXT:    v_readlane_b32 s4, v43, 13
+; SI-NEXT:    v_readlane_b32 s5, v43, 12
 ; SI-NEXT:    s_and_b32 s4, s4, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s55, 8
-; SI-NEXT:    s_add_i32 s8, s37, 3
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_and_b32 s8, s8, 0xff
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s51, 24
-; SI-NEXT:    s_lshl_b32 s8, s8, 16
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s5, s8
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s11, s4, 0x3000000
-; SI-NEXT:    s_and_b32 s4, s52, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s67, 8
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_or_b32_e32 v4, s4, v4
-; SI-NEXT:    s_and_b32 s4, s30, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s21, 8
-; SI-NEXT:    s_add_i32 s8, s68, 3
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_and_b32 s8, s8, 0xff
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s28, 24
-; SI-NEXT:    s_lshl_b32 s8, s8, 16
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s5, s8
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s48, s4, 0x3000000
-; SI-NEXT:    v_readlane_b32 s4, v62, 42
-; SI-NEXT:    s_add_i32 s87, s4, 3
-; SI-NEXT:    v_readlane_b32 s5, v62, 39
-; SI-NEXT:    s_and_b32 s4, s87, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s5, 8
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_or_b32_e32 v7, s4, v7
-; SI-NEXT:    v_readlane_b32 s4, v62, 32
-; SI-NEXT:    s_add_i32 s67, s4, 3
-; SI-NEXT:    s_and_b32 s4, s67, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s85, 8
-; SI-NEXT:    s_add_i32 s8, s69, 3
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_and_b32 s8, s8, 0xff
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s74, 24
-; SI-NEXT:    s_lshl_b32 s8, s8, 16
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s5, s8
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    v_readlane_b32 s5, v62, 23
-; SI-NEXT:    s_add_i32 s49, s4, 0x3000000
-; SI-NEXT:    s_and_b32 s4, s50, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s5, 8
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_or_b32_e32 v8, s4, v8
-; SI-NEXT:    s_and_b32 s4, s94, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s38, 8
-; SI-NEXT:    s_add_i32 s8, s71, 3
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_and_b32 s8, s8, 0xff
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s81, 24
-; SI-NEXT:    s_lshl_b32 s8, s8, 16
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s5, s8
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s94, s4, 0x3000000
-; SI-NEXT:    v_readlane_b32 s4, v62, 52
-; SI-NEXT:    s_add_i32 s18, s4, 3
-; SI-NEXT:    v_readlane_b32 s5, v62, 51
-; SI-NEXT:    s_and_b32 s4, s18, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s5, 8
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_or_b32_e32 v9, s4, v9
-; SI-NEXT:    v_readlane_b32 s4, v62, 45
-; SI-NEXT:    s_add_i32 s98, s4, 3
-; SI-NEXT:    v_readlane_b32 s5, v62, 43
-; SI-NEXT:    v_readlane_b32 s6, v62, 14
-; SI-NEXT:    s_and_b32 s4, s98, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s5, 8
-; SI-NEXT:    s_add_i32 s8, s6, 3
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_and_b32 s8, s8, 0xff
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s31, 24
-; SI-NEXT:    s_lshl_b32 s8, s8, 16
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s5, s8
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s53, s4, 0x3000000
-; SI-NEXT:    v_readlane_b32 s4, v62, 41
-; SI-NEXT:    s_add_i32 s86, s4, 3
-; SI-NEXT:    v_readlane_b32 s5, v62, 38
-; SI-NEXT:    s_and_b32 s4, s86, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s5, 8
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_or_b32_e32 v10, s4, v10
-; SI-NEXT:    v_readlane_b32 s4, v62, 31
-; SI-NEXT:    s_add_i32 s66, s4, 3
-; SI-NEXT:    v_readlane_b32 s5, v62, 27
-; SI-NEXT:    s_and_b32 s4, s66, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s5, 8
-; SI-NEXT:    s_add_i32 s37, s39, 3
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_and_b32 s8, s37, 0xff
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s95, 24
-; SI-NEXT:    s_lshl_b32 s8, s8, 16
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s5, s8
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s36, s4, 0x3000000
-; SI-NEXT:    v_readlane_b32 s4, v62, 50
-; SI-NEXT:    s_add_i32 s21, s4, 3
-; SI-NEXT:    v_readlane_b32 s5, v62, 49
-; SI-NEXT:    s_and_b32 s4, s21, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s5, 8
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_or_b32_e32 v11, s4, v11
-; SI-NEXT:    v_readlane_b32 s4, v62, 40
-; SI-NEXT:    s_add_i32 s85, s4, 3
-; SI-NEXT:    v_readlane_b32 s5, v62, 37
-; SI-NEXT:    s_and_b32 s4, s85, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s5, 8
-; SI-NEXT:    s_add_i32 s71, s22, 3
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    v_readlane_b32 s5, v62, 33
-; SI-NEXT:    s_and_b32 s8, s71, 0xff
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s5, 24
-; SI-NEXT:    s_lshl_b32 s8, s8, 16
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s5, s8
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s8, s4, 0x3000000
-; SI-NEXT:    v_readlane_b32 s4, v62, 54
-; SI-NEXT:    s_add_i32 s17, s4, 3
-; SI-NEXT:    v_readlane_b32 s5, v62, 53
-; SI-NEXT:    s_and_b32 s4, s17, 0xff
+; SI-NEXT:    s_or_b32 s29, s4, s5
+; SI-NEXT:    v_readlane_b32 s4, v43, 5
+; SI-NEXT:    v_readlane_b32 s5, v43, 4
+; SI-NEXT:    s_and_b32 s4, s4, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s5, 8
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_or_b32_e32 v5, s4, v5
-; SI-NEXT:    v_readlane_b32 s4, v62, 48
-; SI-NEXT:    s_add_i32 s7, s4, 3
-; SI-NEXT:    v_readlane_b32 s5, v62, 47
+; SI-NEXT:    s_or_b32 s45, s4, s5
+; SI-NEXT:    s_and_b32 s4, s43, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s42, 8
+; SI-NEXT:    s_or_b32 s46, s4, s5
+; SI-NEXT:    s_and_b32 s4, s56, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s47, 8
+; SI-NEXT:    s_or_b32 s47, s4, s5
+; SI-NEXT:    s_and_b32 s4, s58, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s57, 8
+; SI-NEXT:    s_or_b32 s56, s4, s5
+; SI-NEXT:    s_and_b32 s4, s60, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s59, 8
+; SI-NEXT:    s_or_b32 s57, s4, s5
+; SI-NEXT:    s_and_b32 s4, s62, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s61, 8
+; SI-NEXT:    s_or_b32 s58, s4, s5
+; SI-NEXT:    s_and_b32 s4, s74, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s63, 8
+; SI-NEXT:    s_or_b32 s59, s4, s5
+; SI-NEXT:    s_and_b32 s4, s73, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s72, 8
+; SI-NEXT:    s_or_b32 s60, s4, s5
+; SI-NEXT:    s_and_b32 s4, s24, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s75, 8
+; SI-NEXT:    s_or_b32 s61, s4, s5
+; SI-NEXT:    s_and_b32 s4, s28, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s25, 8
+; SI-NEXT:    s_or_b32 s62, s4, s5
+; SI-NEXT:    s_and_b32 s4, s36, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s16, 8
+; SI-NEXT:    s_or_b32 s63, s4, s5
+; SI-NEXT:    s_and_b32 s4, s10, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s9, 8
+; SI-NEXT:    s_or_b32 s72, s4, s5
 ; SI-NEXT:    s_and_b32 s4, s7, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s5, 8
-; SI-NEXT:    s_add_i32 s35, s99, 3
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_and_b32 s6, s35, 0xff
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s89, 24
-; SI-NEXT:    s_lshl_b32 s6, s6, 16
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s5, s6
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s10, s4, 0x3000000
-; SI-NEXT:    v_readlane_b32 s4, v62, 46
-; SI-NEXT:    s_add_i32 s99, s4, 3
-; SI-NEXT:    v_readlane_b32 s5, v62, 44
-; SI-NEXT:    v_add_i32_e32 v15, vcc, 0x3000000, v5
-; SI-NEXT:    s_and_b32 s4, s99, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s5, 8
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v29
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_or_b32_e32 v5, v19, v5
-; SI-NEXT:    v_or_b32_e32 v5, s4, v5
-; SI-NEXT:    v_readlane_b32 s4, v62, 36
-; SI-NEXT:    s_add_i32 s81, s4, 3
-; SI-NEXT:    v_readlane_b32 s5, v62, 35
-; SI-NEXT:    v_readlane_b32 s6, v62, 28
-; SI-NEXT:    s_and_b32 s4, s81, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s5, 8
-; SI-NEXT:    s_add_i32 s55, s6, 3
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    v_readlane_b32 s5, v62, 26
-; SI-NEXT:    s_and_b32 s6, s55, 0xff
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s5, 24
-; SI-NEXT:    s_lshl_b32 s6, s6, 16
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s5, s6
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s12, s4, 0x3000000
-; SI-NEXT:    v_readlane_b32 s4, v62, 34
-; SI-NEXT:    s_add_i32 s69, s4, 3
-; SI-NEXT:    v_readlane_b32 s5, v62, 29
-; SI-NEXT:    v_add_i32_e32 v16, vcc, 0x3000000, v5
-; SI-NEXT:    s_and_b32 s4, s69, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s5, 8
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v21
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_or_b32_e32 v5, v18, v5
-; SI-NEXT:    v_or_b32_e32 v5, s4, v5
-; SI-NEXT:    v_add_i32_e32 v17, vcc, 0x3000000, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v13
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v13, vcc, 3, v6
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
-; SI-NEXT:    v_readlane_b32 s4, v62, 22
-; SI-NEXT:    s_add_i32 s34, s4, 3
-; SI-NEXT:    v_readlane_b32 s5, v62, 21
-; SI-NEXT:    v_readlane_b32 s6, v62, 19
-; SI-NEXT:    s_and_b32 s4, s34, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s5, 8
-; SI-NEXT:    s_add_i32 s92, s6, 3
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_and_b32 s6, s92, 0xff
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s97, 24
-; SI-NEXT:    s_lshl_b32 s6, s6, 16
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s5, s6
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s13, s4, 0x3000000
-; SI-NEXT:    v_readlane_b32 s4, v62, 25
-; SI-NEXT:    s_add_i32 s51, s4, 3
-; SI-NEXT:    v_readlane_b32 s5, v62, 24
-; SI-NEXT:    s_and_b32 s4, s51, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s5, 8
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_or_b32_e32 v5, v14, v5
-; SI-NEXT:    v_or_b32_e32 v5, s4, v5
-; SI-NEXT:    v_readlane_b32 s4, v62, 20
-; SI-NEXT:    s_add_i32 s95, s4, 3
-; SI-NEXT:    v_readlane_b32 s5, v62, 18
-; SI-NEXT:    s_and_b32 s4, s95, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s5, 8
-; SI-NEXT:    s_add_i32 s6, s96, 3
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_and_b32 s6, s6, 0xff
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s80, 24
-; SI-NEXT:    s_lshl_b32 s6, s6, 16
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s5, s6
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s14, s4, 0x3000000
-; SI-NEXT:    s_add_i32 s4, s93, 3
+; SI-NEXT:    s_lshl_b32 s5, s78, 8
+; SI-NEXT:    s_or_b32 s73, s4, s5
+; SI-NEXT:    s_and_b32 s4, s8, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s91, 8
+; SI-NEXT:    s_or_b32 s74, s4, s5
+; SI-NEXT:    s_and_b32 s4, s6, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s90, 8
+; SI-NEXT:    s_or_b32 s75, s4, s5
+; SI-NEXT:    v_readlane_b32 s4, v43, 9
+; SI-NEXT:    v_readlane_b32 s5, v43, 8
 ; SI-NEXT:    s_and_b32 s4, s4, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s84, 8
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s25, 8
-; SI-NEXT:    s_add_i32 s6, s64, 3
-; SI-NEXT:    s_and_b32 s6, s6, 0xff
-; SI-NEXT:    s_lshl_b32 s6, s6, 16
-; SI-NEXT:    v_add_i32_e32 v49, vcc, 0x3000000, v1
-; SI-NEXT:    v_mov_b32_e32 v30, s16
-; SI-NEXT:    v_mov_b32_e32 v39, s9
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x3000000, v2
-; SI-NEXT:    v_mov_b32_e32 v28, s11
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x3000000, v4
-; SI-NEXT:    v_mov_b32_e32 v27, s48
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x3000000, v7
-; SI-NEXT:    v_mov_b32_e32 v26, s49
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x3000000, v8
-; SI-NEXT:    v_mov_b32_e32 v25, s94
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 0x3000000, v9
-; SI-NEXT:    v_mov_b32_e32 v24, s53
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 0x3000000, v10
-; SI-NEXT:    v_mov_b32_e32 v23, s36
-; SI-NEXT:    v_add_i32_e32 v12, vcc, 0x3000000, v11
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_or_b32_e32 v6, v6, v13
-; SI-NEXT:    v_or_b32_e32 v6, s4, v6
-; SI-NEXT:    s_add_i32 s4, s83, 3
+; SI-NEXT:    s_lshl_b32 s5, s5, 8
+; SI-NEXT:    s_or_b32 s5, s4, s5
+; SI-NEXT:    v_readlane_b32 s4, v43, 7
 ; SI-NEXT:    s_and_b32 s4, s4, 0xff
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    v_readlane_b32 s5, v62, 15
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s5, 24
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s5, s6
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s15, s4, 0x3000000
-; SI-NEXT:    v_readlane_b32 s4, v62, 2
-; SI-NEXT:    s_add_i32 s4, s4, 3
-; SI-NEXT:    v_readlane_b32 s5, v62, 1
+; SI-NEXT:    v_readlane_b32 s6, v43, 6
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    s_lshl_b32 s6, s6, 24
+; SI-NEXT:    s_or_b32 s7, s6, s4
+; SI-NEXT:    v_readlane_b32 s4, v43, 11
 ; SI-NEXT:    s_and_b32 s4, s4, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s5, 8
-; SI-NEXT:    s_add_i32 s6, s26, 3
-; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    v_readlane_b32 s6, v43, 10
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    s_lshl_b32 s6, s6, 24
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    v_writelane_b32 v42, s7, 51
+; SI-NEXT:    s_or_b32 s4, s6, s4
+; SI-NEXT:    s_or_b32 s5, s5, s7
+; SI-NEXT:    v_readlane_b32 s6, v43, 1
+; SI-NEXT:    v_readlane_b32 s7, v43, 0
 ; SI-NEXT:    s_and_b32 s6, s6, 0xff
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s27, 24
+; SI-NEXT:    s_lshl_b32 s7, s7, 8
+; SI-NEXT:    s_or_b32 s7, s6, s7
+; SI-NEXT:    s_and_b32 s6, s11, 0xff
 ; SI-NEXT:    s_lshl_b32 s6, s6, 16
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s5, s6
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s17, s4, 0x3000000
-; SI-NEXT:    v_readlane_b32 s4, v62, 0
-; SI-NEXT:    s_add_i32 s4, s4, 3
-; SI-NEXT:    s_and_b32 s4, s4, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s29, 8
-; SI-NEXT:    s_add_i32 s6, s76, 3
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    v_readlane_b32 s5, v62, 16
+; SI-NEXT:    s_lshl_b32 s8, s12, 24
+; SI-NEXT:    s_or_b32 s37, s8, s6
+; SI-NEXT:    v_readlane_b32 s6, v43, 3
 ; SI-NEXT:    s_and_b32 s6, s6, 0xff
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s5, 24
+; SI-NEXT:    v_readlane_b32 s8, v43, 2
 ; SI-NEXT:    s_lshl_b32 s6, s6, 16
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s5, s6
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s40, s4, 0x3000000
-; SI-NEXT:    v_readlane_b32 s4, v62, 7
+; SI-NEXT:    s_lshl_b32 s8, s8, 24
+; SI-NEXT:    s_or_b32 s6, s8, s6
+; SI-NEXT:    s_and_b32 s8, s18, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s15, 8
+; SI-NEXT:    s_or_b32 s9, s8, s9
+; SI-NEXT:    s_and_b32 s8, s13, 0xff
+; SI-NEXT:    s_lshl_b32 s8, s8, 16
+; SI-NEXT:    s_lshl_b32 s10, s14, 24
+; SI-NEXT:    s_or_b32 s68, s10, s8
+; SI-NEXT:    s_and_b32 s8, s21, 0xff
+; SI-NEXT:    s_lshl_b32 s8, s8, 16
+; SI-NEXT:    s_lshl_b32 s10, s22, 24
+; SI-NEXT:    s_or_b32 s8, s10, s8
+; SI-NEXT:    s_and_b32 s10, s77, 0xff
+; SI-NEXT:    s_lshl_b32 s11, s76, 8
+; SI-NEXT:    s_or_b32 s11, s10, s11
+; SI-NEXT:    s_and_b32 s10, s40, 0xff
+; SI-NEXT:    s_lshl_b32 s10, s10, 16
+; SI-NEXT:    s_lshl_b32 s12, s41, 24
+; SI-NEXT:    s_or_b32 s99, s12, s10
+; SI-NEXT:    s_and_b32 s10, vcc_lo, 0xff
+; SI-NEXT:    s_lshl_b32 s10, s10, 16
+; SI-NEXT:    s_lshl_b32 s12, vcc_hi, 24
+; SI-NEXT:    s_or_b32 s10, s12, s10
+; SI-NEXT:    s_and_b32 s12, s49, 0xff
+; SI-NEXT:    s_lshl_b32 s13, s48, 8
+; SI-NEXT:    s_or_b32 s13, s12, s13
+; SI-NEXT:    s_and_b32 s12, s38, 0xff
+; SI-NEXT:    s_lshl_b32 s12, s12, 16
+; SI-NEXT:    s_lshl_b32 s14, s39, 24
+; SI-NEXT:    s_or_b32 s92, s14, s12
+; SI-NEXT:    s_and_b32 s12, s50, 0xff
+; SI-NEXT:    s_lshl_b32 s12, s12, 16
+; SI-NEXT:    s_lshl_b32 s14, s51, 24
+; SI-NEXT:    s_or_b32 s12, s14, s12
+; SI-NEXT:    s_and_b32 s14, s55, 0xff
+; SI-NEXT:    s_lshl_b32 s15, s54, 8
+; SI-NEXT:    s_or_b32 s15, s14, s15
+; SI-NEXT:    s_and_b32 s14, s52, 0xff
+; SI-NEXT:    s_lshl_b32 s14, s14, 16
+; SI-NEXT:    s_lshl_b32 s24, s53, 24
+; SI-NEXT:    s_mov_b32 s28, s90
+; SI-NEXT:    s_or_b32 s90, s24, s14
+; SI-NEXT:    s_and_b32 s14, s64, 0xff
+; SI-NEXT:    s_lshl_b32 s14, s14, 16
+; SI-NEXT:    s_lshl_b32 s25, s65, 24
+; SI-NEXT:    s_or_b32 s14, s25, s14
+; SI-NEXT:    s_and_b32 s25, s34, 0xff
+; SI-NEXT:    s_lshl_b32 s40, s86, 8
+; SI-NEXT:    s_or_b32 s41, s25, s40
+; SI-NEXT:    s_and_b32 s25, s80, 0xff
+; SI-NEXT:    s_lshl_b32 s25, s25, 16
+; SI-NEXT:    s_lshl_b32 s40, s81, 24
+; SI-NEXT:    s_or_b32 s18, s40, s25
+; SI-NEXT:    s_and_b32 s40, s31, 0xff
+; SI-NEXT:    s_lshl_b32 s40, s40, 16
+; SI-NEXT:    s_lshl_b32 s42, s84, 24
+; SI-NEXT:    s_or_b32 s40, s42, s40
+; SI-NEXT:    s_and_b32 s42, s35, 0xff
+; SI-NEXT:    s_lshl_b32 s43, s97, 8
+; SI-NEXT:    s_or_b32 s43, s42, s43
+; SI-NEXT:    s_and_b32 s42, s71, 0xff
+; SI-NEXT:    s_lshl_b32 s42, s42, 16
+; SI-NEXT:    s_lshl_b32 s76, s67, 24
+; SI-NEXT:    s_or_b32 s35, s76, s42
+; SI-NEXT:    s_and_b32 s42, s87, 0xff
+; SI-NEXT:    s_lshl_b32 s42, s42, 16
+; SI-NEXT:    s_lshl_b32 s76, s83, 24
+; SI-NEXT:    s_or_b32 s42, s76, s42
+; SI-NEXT:    s_and_b32 s76, s19, 0xff
+; SI-NEXT:    s_lshl_b32 s77, s20, 8
+; SI-NEXT:    s_or_b32 s76, s76, s77
+; SI-NEXT:    s_and_b32 s77, s66, 0xff
+; SI-NEXT:    v_writelane_b32 v42, s78, 52
+; SI-NEXT:    s_lshl_b32 s77, s77, 16
+; SI-NEXT:    s_lshl_b32 s78, s85, 24
+; SI-NEXT:    s_or_b32 s19, s78, s77
+; SI-NEXT:    s_and_b32 s77, s94, 0xff
+; SI-NEXT:    s_lshl_b32 s77, s77, 16
+; SI-NEXT:    s_lshl_b32 s78, s27, 24
+; SI-NEXT:    s_and_b32 s76, s76, 0xffff
+; SI-NEXT:    s_or_b32 vcc_lo, s78, s77
+; SI-NEXT:    s_or_b32 vcc_hi, s76, s19
+; SI-NEXT:    s_and_b32 s76, s26, 0xff
+; SI-NEXT:    s_lshl_b32 s77, s23, 8
+; SI-NEXT:    s_or_b32 s76, s76, s77
+; SI-NEXT:    s_and_b32 s77, s98, 0xff
+; SI-NEXT:    s_lshl_b32 s77, s77, 16
+; SI-NEXT:    s_lshl_b32 s78, s17, 24
+; SI-NEXT:    s_or_b32 s71, s78, s77
+; SI-NEXT:    s_and_b32 s77, s79, 0xff
+; SI-NEXT:    s_and_b32 s76, s76, 0xffff
+; SI-NEXT:    v_readlane_b32 s17, v43, 40
+; SI-NEXT:    s_and_b32 s41, s41, 0xffff
+; SI-NEXT:    s_lshl_b32 s77, s77, 16
+; SI-NEXT:    s_lshl_b32 s78, s88, 24
+; SI-NEXT:    s_or_b32 s39, s76, s71
+; SI-NEXT:    s_and_b32 s76, s17, 0xff
+; SI-NEXT:    v_readlane_b32 s17, v43, 39
+; SI-NEXT:    s_or_b32 s41, s41, s18
+; SI-NEXT:    s_mov_b32 s31, s18
+; SI-NEXT:    s_or_b32 s38, s78, s77
+; SI-NEXT:    s_lshl_b32 s77, s17, 8
+; SI-NEXT:    v_readlane_b32 s18, v43, 38
+; SI-NEXT:    s_or_b32 s76, s76, s77
+; SI-NEXT:    s_and_b32 s77, s18, 0xff
+; SI-NEXT:    v_readlane_b32 s18, v43, 37
+; SI-NEXT:    s_lshl_b32 s77, s77, 16
+; SI-NEXT:    s_lshl_b32 s78, s18, 24
+; SI-NEXT:    s_or_b32 s80, s78, s77
+; SI-NEXT:    s_and_b32 s77, s95, 0xff
+; SI-NEXT:    s_and_b32 s76, s76, 0xffff
+; SI-NEXT:    v_readlane_b32 s17, v43, 36
+; SI-NEXT:    s_lshl_b32 s77, s77, 16
+; SI-NEXT:    s_lshl_b32 s78, s93, 24
+; SI-NEXT:    s_or_b32 s49, s76, s80
+; SI-NEXT:    s_and_b32 s76, s17, 0xff
+; SI-NEXT:    v_readlane_b32 s17, v43, 35
+; SI-NEXT:    s_or_b32 s48, s78, s77
+; SI-NEXT:    s_lshl_b32 s77, s17, 8
+; SI-NEXT:    v_readlane_b32 s17, v43, 34
+; SI-NEXT:    s_or_b32 s76, s76, s77
+; SI-NEXT:    s_and_b32 s77, s17, 0xff
+; SI-NEXT:    v_readlane_b32 s17, v43, 33
+; SI-NEXT:    s_lshl_b32 s77, s77, 16
+; SI-NEXT:    s_lshl_b32 s78, s17, 24
+; SI-NEXT:    s_or_b32 s81, s78, s77
+; SI-NEXT:    s_and_b32 s77, s30, 0xff
+; SI-NEXT:    s_lshl_b32 s77, s77, 16
+; SI-NEXT:    s_lshl_b32 s78, s69, 24
+; SI-NEXT:    s_and_b32 s76, s76, 0xffff
+; SI-NEXT:    v_readlane_b32 s17, v43, 31
+; SI-NEXT:    s_or_b32 s50, s78, s77
+; SI-NEXT:    s_or_b32 s51, s76, s81
+; SI-NEXT:    s_and_b32 s76, s17, 0xff
+; SI-NEXT:    s_lshl_b32 s77, s96, 8
+; SI-NEXT:    v_readlane_b32 s17, v43, 30
+; SI-NEXT:    s_or_b32 s76, s76, s77
+; SI-NEXT:    s_and_b32 s77, s17, 0xff
+; SI-NEXT:    s_lshl_b32 s77, s77, 16
+; SI-NEXT:    s_lshl_b32 s78, s82, 24
+; SI-NEXT:    v_writelane_b32 v42, s96, 53
+; SI-NEXT:    v_readlane_b32 s18, v43, 32
+; SI-NEXT:    v_writelane_b32 v42, s82, 54
+; SI-NEXT:    s_or_b32 s82, s78, s77
+; SI-NEXT:    s_and_b32 s77, s18, 0xff
+; SI-NEXT:    s_and_b32 s76, s76, 0xffff
+; SI-NEXT:    v_readlane_b32 s17, v43, 28
+; SI-NEXT:    s_lshl_b32 s77, s77, 16
+; SI-NEXT:    s_lshl_b32 s78, s70, 24
+; SI-NEXT:    s_or_b32 s53, s76, s82
+; SI-NEXT:    s_and_b32 s76, s17, 0xff
+; SI-NEXT:    v_readlane_b32 s17, v43, 27
+; SI-NEXT:    s_or_b32 s52, s78, s77
+; SI-NEXT:    s_lshl_b32 s77, s17, 8
+; SI-NEXT:    v_readlane_b32 s18, v43, 26
+; SI-NEXT:    s_or_b32 s76, s76, s77
+; SI-NEXT:    s_and_b32 s77, s18, 0xff
+; SI-NEXT:    v_readlane_b32 s17, v43, 25
+; SI-NEXT:    s_lshl_b32 s77, s77, 16
+; SI-NEXT:    s_lshl_b32 s78, s17, 24
+; SI-NEXT:    v_writelane_b32 v42, s16, 55
+; SI-NEXT:    s_or_b32 s16, s78, s77
+; SI-NEXT:    s_and_b32 s77, s89, 0xff
+; SI-NEXT:    v_readlane_b32 s18, v43, 29
+; SI-NEXT:    s_lshl_b32 s77, s77, 16
+; SI-NEXT:    s_lshl_b32 s78, s18, 24
+; SI-NEXT:    s_and_b32 s76, s76, 0xffff
+; SI-NEXT:    v_readlane_b32 s17, v43, 22
+; SI-NEXT:    v_readlane_b32 s18, v43, 21
+; SI-NEXT:    s_or_b32 s54, s78, s77
+; SI-NEXT:    s_or_b32 s55, s76, s16
+; SI-NEXT:    s_and_b32 s76, s17, 0xff
+; SI-NEXT:    s_lshl_b32 s77, s18, 8
+; SI-NEXT:    v_readlane_b32 s17, v43, 20
+; SI-NEXT:    s_or_b32 s76, s76, s77
+; SI-NEXT:    s_and_b32 s77, s17, 0xff
+; SI-NEXT:    v_readlane_b32 s17, v43, 19
+; SI-NEXT:    s_lshl_b32 s77, s77, 16
+; SI-NEXT:    s_lshl_b32 s78, s17, 24
+; SI-NEXT:    v_readlane_b32 s17, v43, 24
+; SI-NEXT:    s_or_b32 s83, s78, s77
+; SI-NEXT:    s_and_b32 s77, s17, 0xff
+; SI-NEXT:    v_readlane_b32 s17, v43, 23
+; SI-NEXT:    s_lshl_b32 s77, s77, 16
+; SI-NEXT:    s_lshl_b32 s78, s17, 24
+; SI-NEXT:    s_and_b32 s76, s76, 0xffff
+; SI-NEXT:    v_readlane_b32 s17, v43, 17
+; SI-NEXT:    v_readlane_b32 s18, v43, 16
+; SI-NEXT:    s_or_b32 s64, s78, s77
+; SI-NEXT:    s_or_b32 s65, s76, s83
+; SI-NEXT:    s_and_b32 s76, s17, 0xff
+; SI-NEXT:    s_lshl_b32 s77, s18, 8
+; SI-NEXT:    v_readlane_b32 s18, v43, 15
+; SI-NEXT:    s_or_b32 s76, s76, s77
+; SI-NEXT:    s_and_b32 s77, s18, 0xff
+; SI-NEXT:    v_readlane_b32 s18, v43, 14
+; SI-NEXT:    v_writelane_b32 v42, s89, 56
+; SI-NEXT:    s_lshl_b32 s77, s77, 16
+; SI-NEXT:    s_lshl_b32 s78, s18, 24
+; SI-NEXT:    v_writelane_b32 v42, s70, 57
+; SI-NEXT:    s_or_b32 s85, s78, s77
+; SI-NEXT:    s_and_b32 s77, s44, 0xff
+; SI-NEXT:    v_readlane_b32 s18, v43, 18
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_and_b32 s9, s9, 0xffff
+; SI-NEXT:    s_and_b32 s11, s11, 0xffff
+; SI-NEXT:    s_and_b32 s13, s13, 0xffff
+; SI-NEXT:    s_and_b32 s15, s15, 0xffff
+; SI-NEXT:    s_and_b32 s43, s43, 0xffff
+; SI-NEXT:    v_writelane_b32 v42, s69, 58
+; SI-NEXT:    s_lshl_b32 s77, s77, 16
+; SI-NEXT:    s_lshl_b32 s78, s18, 24
+; SI-NEXT:    s_and_b32 s76, s76, 0xffff
+; SI-NEXT:    s_and_b32 s44, s29, 0xffff
+; SI-NEXT:    s_or_b32 s7, s7, s37
+; SI-NEXT:    s_or_b32 s9, s9, s68
+; SI-NEXT:    s_or_b32 s11, s11, s99
+; SI-NEXT:    s_or_b32 s13, s13, s92
+; SI-NEXT:    s_or_b32 s15, s15, s90
+; SI-NEXT:    s_or_b32 s43, s43, s35
+; SI-NEXT:    v_writelane_b32 v42, s30, 59
+; SI-NEXT:    s_mov_b32 s23, s91
+; SI-NEXT:    s_mov_b32 s91, s36
+; SI-NEXT:    s_or_b32 s66, s78, s77
+; SI-NEXT:    s_or_b32 s67, s76, s85
+; SI-NEXT:    s_and_b32 s45, s45, 0xffff
+; SI-NEXT:    s_and_b32 s46, s46, 0xffff
+; SI-NEXT:    s_and_b32 s47, s47, 0xffff
+; SI-NEXT:    s_and_b32 s56, s56, 0xffff
+; SI-NEXT:    s_and_b32 s57, s57, 0xffff
+; SI-NEXT:    s_and_b32 s30, s58, 0xffff
+; SI-NEXT:    s_and_b32 s34, s59, 0xffff
+; SI-NEXT:    s_and_b32 s36, s60, 0xffff
+; SI-NEXT:    s_and_b32 s97, s61, 0xffff
+; SI-NEXT:    s_and_b32 s86, s62, 0xffff
+; SI-NEXT:    s_and_b32 s98, s63, 0xffff
+; SI-NEXT:    s_and_b32 s17, s72, 0xffff
+; SI-NEXT:    s_and_b32 s87, s73, 0xffff
+; SI-NEXT:    s_and_b32 s96, s74, 0xffff
+; SI-NEXT:    s_and_b32 s22, s75, 0xffff
+; SI-NEXT:    s_or_b32 s74, s44, s4
+; SI-NEXT:    s_mov_b32 s75, s5
+; SI-NEXT:    s_lshr_b64 s[76:77], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[40:41], 16
+; SI-NEXT:    s_mov_b32 s70, s93
+; SI-NEXT:    s_mov_b32 s69, s95
+; SI-NEXT:    s_mov_b32 s93, s28
+; SI-NEXT:    s_or_b32 s72, s45, s6
+; SI-NEXT:    s_mov_b32 s73, s7
+; SI-NEXT:    s_lshr_b64 s[26:27], s[6:7], 16
+; SI-NEXT:    s_or_b32 s62, s46, s8
+; SI-NEXT:    s_mov_b32 s63, s9
+; SI-NEXT:    s_lshr_b64 s[28:29], s[8:9], 16
+; SI-NEXT:    s_or_b32 s60, s47, s10
+; SI-NEXT:    s_mov_b32 s61, s11
+; SI-NEXT:    s_lshr_b64 s[88:89], s[10:11], 16
+; SI-NEXT:    s_or_b32 s58, s56, s12
+; SI-NEXT:    s_mov_b32 s59, s13
+; SI-NEXT:    s_lshr_b64 s[20:21], s[12:13], 16
+; SI-NEXT:    s_or_b32 s56, s57, s14
+; SI-NEXT:    s_mov_b32 s57, s15
+; SI-NEXT:    s_lshr_b64 s[24:25], s[14:15], 16
+; SI-NEXT:    s_or_b32 s46, s30, s40
+; SI-NEXT:    s_mov_b32 s47, s41
+; SI-NEXT:    s_or_b32 s44, s34, s42
+; SI-NEXT:    s_mov_b32 s34, s4
+; SI-NEXT:    s_mov_b32 s45, s43
+; SI-NEXT:    s_lshr_b64 s[94:95], s[42:43], 16
+; SI-NEXT:    s_or_b32 s42, s36, vcc_lo
+; SI-NEXT:    s_mov_b32 s43, vcc_hi
+; SI-NEXT:    s_lshr_b64 vcc, vcc, 16
+; SI-NEXT:    s_or_b32 s40, s97, s38
+; SI-NEXT:    s_mov_b32 s41, s39
+; SI-NEXT:    s_lshr_b64 s[38:39], s[38:39], 16
+; SI-NEXT:    s_or_b32 s14, s86, s48
+; SI-NEXT:    s_mov_b32 s15, s49
+; SI-NEXT:    s_lshr_b64 s[48:49], s[48:49], 16
+; SI-NEXT:    s_or_b32 s12, s98, s50
+; SI-NEXT:    s_mov_b32 s13, s51
+; SI-NEXT:    s_lshr_b64 s[50:51], s[50:51], 16
+; SI-NEXT:    s_or_b32 s10, s17, s52
+; SI-NEXT:    s_mov_b32 s11, s53
+; SI-NEXT:    s_lshr_b64 s[52:53], s[52:53], 16
+; SI-NEXT:    s_or_b32 s8, s87, s54
+; SI-NEXT:    s_mov_b32 s9, s55
+; SI-NEXT:    s_lshr_b64 s[54:55], s[54:55], 16
+; SI-NEXT:    s_or_b32 s6, s96, s64
+; SI-NEXT:    s_mov_b32 s7, s65
+; SI-NEXT:    s_lshr_b64 s[64:65], s[64:65], 16
+; SI-NEXT:    s_or_b32 s4, s22, s66
+; SI-NEXT:    s_mov_b32 s5, s67
+; SI-NEXT:    s_lshr_b64 s[66:67], s[66:67], 16
+; SI-NEXT:    v_readlane_b32 s17, v42, 51
+; SI-NEXT:    s_lshr_b32 s55, s17, 16
+; SI-NEXT:    s_lshr_b32 s53, s37, 16
+; SI-NEXT:    s_lshr_b32 s51, s68, 16
+; SI-NEXT:    s_lshr_b32 s49, s99, 16
+; SI-NEXT:    s_lshr_b32 s86, s92, 16
+; SI-NEXT:    s_lshr_b32 s39, s90, 16
+; SI-NEXT:    s_lshr_b32 s18, s31, 16
+; SI-NEXT:    s_lshr_b32 s22, s35, 16
+; SI-NEXT:    s_lshr_b32 s97, s19, 16
+; SI-NEXT:    s_lshr_b32 s65, s71, 16
+; SI-NEXT:    s_lshr_b32 s19, s80, 16
+; SI-NEXT:    s_lshr_b32 s71, s81, 16
+; SI-NEXT:    s_lshr_b32 s67, s82, 16
+; SI-NEXT:    v_readlane_b32 s82, v42, 54
+; SI-NEXT:    v_readlane_b32 s96, v42, 53
+; SI-NEXT:    s_lshr_b32 s80, s16, 16
+; SI-NEXT:    v_readlane_b32 s16, v42, 55
+; SI-NEXT:    s_lshr_b32 s81, s83, 16
+; SI-NEXT:    s_mov_b32 s90, s93
+; SI-NEXT:    v_readlane_b32 s78, v42, 52
+; SI-NEXT:    s_mov_b32 s95, s69
+; SI-NEXT:    s_mov_b32 s93, s70
+; SI-NEXT:    v_readlane_b32 s30, v42, 59
+; SI-NEXT:    v_readlane_b32 s69, v42, 58
+; SI-NEXT:    v_readlane_b32 s70, v42, 57
+; SI-NEXT:    v_readlane_b32 s89, v42, 56
+; SI-NEXT:    s_lshr_b32 s77, s85, 16
+; SI-NEXT:    s_mov_b32 s84, vcc_lo
+; SI-NEXT:    s_mov_b32 s36, s91
+; SI-NEXT:    s_mov_b32 s91, s23
+; SI-NEXT:    s_cbranch_execnz .LBB97_3
+; SI-NEXT:  .LBB97_2: ; %cmp.true
+; SI-NEXT:    v_readlane_b32 s4, v43, 42
 ; SI-NEXT:    s_add_i32 s4, s4, 3
-; SI-NEXT:    v_readlane_b32 s5, v62, 17
-; SI-NEXT:    v_readlane_b32 s6, v62, 6
+; SI-NEXT:    v_readlane_b32 s6, v43, 41
 ; SI-NEXT:    s_and_b32 s4, s4, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s5, 8
+; SI-NEXT:    s_lshl_b32 s5, s90, 8
 ; SI-NEXT:    s_add_i32 s6, s6, 3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    v_readlane_b32 s5, v43, 18
 ; SI-NEXT:    s_and_b32 s6, s6, 0xff
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s19, 24
+; SI-NEXT:    s_lshl_b32 s5, s5, 24
 ; SI-NEXT:    s_lshl_b32 s6, s6, 16
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; SI-NEXT:    s_addk_i32 s4, 0x300
 ; SI-NEXT:    s_or_b32 s5, s5, s6
+; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s18, s4, 0x3000000
-; SI-NEXT:    s_add_i32 s4, s20, 3
-; SI-NEXT:    v_readlane_b32 s5, v62, 5
-; SI-NEXT:    v_readlane_b32 s6, v62, 4
-; SI-NEXT:    s_and_b32 s4, s4, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s5, 8
+; SI-NEXT:    v_readlane_b32 s5, v43, 17
+; SI-NEXT:    s_add_i32 s5, s5, 3
+; SI-NEXT:    v_readlane_b32 s6, v43, 16
+; SI-NEXT:    v_readlane_b32 s7, v43, 15
+; SI-NEXT:    s_and_b32 s5, s5, 0xff
+; SI-NEXT:    s_lshl_b32 s6, s6, 8
+; SI-NEXT:    s_add_i32 s7, s7, 3
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    v_readlane_b32 s6, v43, 14
+; SI-NEXT:    s_and_b32 s7, s7, 0xff
+; SI-NEXT:    s_lshl_b32 s6, s6, 24
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    s_addk_i32 s5, 0x300
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    v_readlane_b32 s6, v43, 44
 ; SI-NEXT:    s_add_i32 s6, s6, 3
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    v_readlane_b32 s5, v62, 3
+; SI-NEXT:    v_readlane_b32 s8, v43, 24
 ; SI-NEXT:    s_and_b32 s6, s6, 0xff
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s5, 24
-; SI-NEXT:    s_lshl_b32 s6, s6, 16
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s5, s6
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s42, s4, 0x3000000
-; SI-NEXT:    v_mov_b32_e32 v13, s18
-; SI-NEXT:    v_mov_b32_e32 v22, s8
-; SI-NEXT:    v_mov_b32_e32 v20, s10
-; SI-NEXT:    v_mov_b32_e32 v19, s12
-; SI-NEXT:    v_mov_b32_e32 v18, s13
-; SI-NEXT:    v_add_i32_e32 v45, vcc, 0x3000000, v5
-; SI-NEXT:    v_mov_b32_e32 v5, s14
-; SI-NEXT:    v_add_i32_e32 v47, vcc, 0x3000000, v6
-; SI-NEXT:    v_mov_b32_e32 v6, s15
-; SI-NEXT:    v_alignbit_b32 v57, s42, v13, 16
-; SI-NEXT:    v_mov_b32_e32 v13, s17
-; SI-NEXT:    v_alignbit_b32 v58, s40, v13, 16
-; SI-NEXT:    v_alignbit_b32 v56, v6, v47, 16
-; SI-NEXT:    v_alignbit_b32 v51, v5, v45, 16
-; SI-NEXT:    v_alignbit_b32 v1, v18, v17, 16
-; SI-NEXT:    v_alignbit_b32 v3, v19, v16, 16
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_alignbit_b32 v44, v20, v15, 16
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v43, v22, v12, 16
-; SI-NEXT:    v_alignbit_b32 v42, v23, v10, 16
-; SI-NEXT:    v_alignbit_b32 v41, v24, v9, 16
-; SI-NEXT:    v_alignbit_b32 v40, v25, v8, 16
-; SI-NEXT:    v_alignbit_b32 v55, v26, v7, 16
-; SI-NEXT:    v_alignbit_b32 v54, v27, v4, 16
-; SI-NEXT:    v_alignbit_b32 v53, v28, v2, 16
-; SI-NEXT:    v_alignbit_b32 v52, v39, v31, 16
-; SI-NEXT:    v_alignbit_b32 v28, v30, v49, 16
-; SI-NEXT:    s_lshr_b32 s73, s42, 16
-; SI-NEXT:    s_lshr_b32 s72, s40, 16
-; SI-NEXT:    s_lshr_b32 s63, s15, 16
-; SI-NEXT:    s_lshr_b32 s62, s14, 16
-; SI-NEXT:    s_lshr_b32 s61, s13, 16
-; SI-NEXT:    s_lshr_b32 s60, s12, 16
-; SI-NEXT:    s_lshr_b32 s59, s10, 16
-; SI-NEXT:    s_lshr_b32 s58, s8, 16
-; SI-NEXT:    s_lshr_b32 s57, s36, 16
-; SI-NEXT:    s_lshr_b32 s56, s53, 16
-; SI-NEXT:    s_lshr_b32 s47, s94, 16
-; SI-NEXT:    s_lshr_b32 s46, s49, 16
-; SI-NEXT:    s_lshr_b32 s45, s48, 16
-; SI-NEXT:    s_lshr_b32 s44, s11, 16
-; SI-NEXT:    s_lshr_b32 s43, s9, 16
-; SI-NEXT:    s_lshr_b32 s41, s16, 16
+; SI-NEXT:    s_lshl_b32 s7, s91, 8
+; SI-NEXT:    s_add_i32 s8, s8, 3
+; SI-NEXT:    s_or_b32 s6, s7, s6
+; SI-NEXT:    v_readlane_b32 s7, v43, 23
+; SI-NEXT:    s_and_b32 s8, s8, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s7, 24
+; SI-NEXT:    s_lshl_b32 s8, s8, 16
+; SI-NEXT:    s_addk_i32 s6, 0x300
+; SI-NEXT:    s_or_b32 s7, s7, s8
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_or_b32 s6, s7, s6
+; SI-NEXT:    v_readlane_b32 s7, v43, 22
+; SI-NEXT:    s_add_i32 s7, s7, 3
+; SI-NEXT:    v_readlane_b32 s8, v43, 21
+; SI-NEXT:    v_readlane_b32 s9, v43, 20
+; SI-NEXT:    s_and_b32 s7, s7, 0xff
+; SI-NEXT:    s_lshl_b32 s8, s8, 8
+; SI-NEXT:    s_add_i32 s9, s9, 3
+; SI-NEXT:    s_or_b32 s7, s8, s7
+; SI-NEXT:    v_readlane_b32 s8, v43, 19
+; SI-NEXT:    s_and_b32 s9, s9, 0xff
+; SI-NEXT:    s_lshl_b32 s8, s8, 24
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_addk_i32 s7, 0x300
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s7, s8, s7
+; SI-NEXT:    v_readlane_b32 s8, v43, 43
+; SI-NEXT:    s_add_i32 s8, s8, 3
+; SI-NEXT:    s_and_b32 s8, s8, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s78, 8
+; SI-NEXT:    s_add_i32 s10, s89, 3
+; SI-NEXT:    s_or_b32 s8, s9, s8
+; SI-NEXT:    v_readlane_b32 s9, v43, 29
+; SI-NEXT:    s_and_b32 s10, s10, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s9, 24
+; SI-NEXT:    s_lshl_b32 s10, s10, 16
+; SI-NEXT:    s_addk_i32 s8, 0x300
+; SI-NEXT:    s_or_b32 s9, s9, s10
+; SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; SI-NEXT:    s_or_b32 s8, s9, s8
+; SI-NEXT:    v_readlane_b32 s9, v43, 28
+; SI-NEXT:    s_add_i32 s9, s9, 3
+; SI-NEXT:    v_readlane_b32 s10, v43, 27
+; SI-NEXT:    v_readlane_b32 s11, v43, 26
+; SI-NEXT:    s_and_b32 s9, s9, 0xff
+; SI-NEXT:    s_lshl_b32 s10, s10, 8
+; SI-NEXT:    s_add_i32 s11, s11, 3
+; SI-NEXT:    s_or_b32 s9, s10, s9
+; SI-NEXT:    v_readlane_b32 s10, v43, 25
+; SI-NEXT:    s_and_b32 s11, s11, 0xff
+; SI-NEXT:    s_lshl_b32 s10, s10, 24
+; SI-NEXT:    s_lshl_b32 s11, s11, 16
+; SI-NEXT:    s_addk_i32 s9, 0x300
+; SI-NEXT:    s_or_b32 s10, s10, s11
+; SI-NEXT:    s_and_b32 s9, s9, 0xffff
+; SI-NEXT:    s_or_b32 s9, s10, s9
+; SI-NEXT:    v_readlane_b32 s10, v43, 46
+; SI-NEXT:    s_add_i32 s10, s10, 3
+; SI-NEXT:    v_readlane_b32 s11, v43, 45
+; SI-NEXT:    v_readlane_b32 s12, v43, 32
+; SI-NEXT:    s_and_b32 s10, s10, 0xff
+; SI-NEXT:    s_lshl_b32 s11, s11, 8
+; SI-NEXT:    s_add_i32 s12, s12, 3
+; SI-NEXT:    s_or_b32 s10, s11, s10
+; SI-NEXT:    s_and_b32 s12, s12, 0xff
+; SI-NEXT:    s_lshl_b32 s11, s70, 24
+; SI-NEXT:    s_lshl_b32 s12, s12, 16
+; SI-NEXT:    s_addk_i32 s10, 0x300
+; SI-NEXT:    s_or_b32 s11, s11, s12
+; SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; SI-NEXT:    s_or_b32 s10, s11, s10
+; SI-NEXT:    v_readlane_b32 s11, v43, 31
+; SI-NEXT:    s_add_i32 s11, s11, 3
+; SI-NEXT:    v_readlane_b32 s13, v43, 30
+; SI-NEXT:    s_and_b32 s11, s11, 0xff
+; SI-NEXT:    s_lshl_b32 s12, s96, 8
+; SI-NEXT:    s_add_i32 s13, s13, 3
+; SI-NEXT:    s_or_b32 s11, s12, s11
+; SI-NEXT:    s_and_b32 s13, s13, 0xff
+; SI-NEXT:    s_lshl_b32 s12, s82, 24
+; SI-NEXT:    s_lshl_b32 s13, s13, 16
+; SI-NEXT:    s_addk_i32 s11, 0x300
+; SI-NEXT:    s_or_b32 s12, s12, s13
+; SI-NEXT:    s_and_b32 s11, s11, 0xffff
+; SI-NEXT:    s_or_b32 s11, s12, s11
+; SI-NEXT:    s_add_i32 s12, s36, 3
+; SI-NEXT:    s_and_b32 s12, s12, 0xff
+; SI-NEXT:    s_lshl_b32 s13, s16, 8
+; SI-NEXT:    s_add_i32 s14, s30, 3
+; SI-NEXT:    s_or_b32 s12, s13, s12
+; SI-NEXT:    s_and_b32 s14, s14, 0xff
+; SI-NEXT:    s_lshl_b32 s13, s69, 24
+; SI-NEXT:    s_lshl_b32 s14, s14, 16
+; SI-NEXT:    s_addk_i32 s12, 0x300
+; SI-NEXT:    s_or_b32 s13, s13, s14
+; SI-NEXT:    s_and_b32 s12, s12, 0xffff
+; SI-NEXT:    s_or_b32 s12, s13, s12
+; SI-NEXT:    v_readlane_b32 s13, v43, 36
+; SI-NEXT:    s_add_i32 s13, s13, 3
+; SI-NEXT:    v_readlane_b32 s14, v43, 35
+; SI-NEXT:    v_readlane_b32 s15, v43, 34
+; SI-NEXT:    s_and_b32 s13, s13, 0xff
+; SI-NEXT:    s_lshl_b32 s14, s14, 8
+; SI-NEXT:    s_add_i32 s15, s15, 3
+; SI-NEXT:    s_or_b32 s13, s14, s13
+; SI-NEXT:    v_readlane_b32 s14, v43, 33
+; SI-NEXT:    s_and_b32 s15, s15, 0xff
+; SI-NEXT:    s_lshl_b32 s14, s14, 24
+; SI-NEXT:    s_lshl_b32 s15, s15, 16
+; SI-NEXT:    s_addk_i32 s13, 0x300
+; SI-NEXT:    s_or_b32 s14, s14, s15
+; SI-NEXT:    s_and_b32 s13, s13, 0xffff
+; SI-NEXT:    s_or_b32 s13, s14, s13
+; SI-NEXT:    v_readlane_b32 s14, v42, 50
+; SI-NEXT:    s_add_i32 s17, s14, 3
+; SI-NEXT:    v_readlane_b32 s15, v42, 49
+; SI-NEXT:    s_and_b32 s14, s17, 0xff
+; SI-NEXT:    s_lshl_b32 s15, s15, 8
+; SI-NEXT:    s_add_i32 s16, s95, 3
+; SI-NEXT:    s_or_b32 s14, s15, s14
+; SI-NEXT:    s_and_b32 s16, s16, 0xff
+; SI-NEXT:    s_lshl_b32 s15, s93, 24
+; SI-NEXT:    s_lshl_b32 s16, s16, 16
+; SI-NEXT:    s_addk_i32 s14, 0x300
+; SI-NEXT:    s_or_b32 s15, s15, s16
+; SI-NEXT:    s_and_b32 s14, s14, 0xffff
+; SI-NEXT:    s_or_b32 s14, s15, s14
+; SI-NEXT:    v_readlane_b32 s15, v43, 40
+; SI-NEXT:    s_add_i32 s15, s15, 3
+; SI-NEXT:    v_readlane_b32 s16, v43, 39
+; SI-NEXT:    v_readlane_b32 s17, v43, 38
+; SI-NEXT:    s_and_b32 s15, s15, 0xff
+; SI-NEXT:    s_lshl_b32 s16, s16, 8
+; SI-NEXT:    s_add_i32 s17, s17, 3
+; SI-NEXT:    s_or_b32 s15, s16, s15
+; SI-NEXT:    v_readlane_b32 s16, v43, 37
+; SI-NEXT:    s_and_b32 s17, s17, 0xff
+; SI-NEXT:    s_lshl_b32 s16, s16, 24
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_addk_i32 s15, 0x300
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    s_and_b32 s15, s15, 0xffff
+; SI-NEXT:    s_or_b32 s15, s16, s15
+; SI-NEXT:    v_readlane_b32 s16, v42, 48
+; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    v_readlane_b32 s17, v42, 47
+; SI-NEXT:    v_readlane_b32 s18, v42, 42
+; SI-NEXT:    s_and_b32 s16, s16, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 8
+; SI-NEXT:    s_add_i32 s99, s18, 3
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    v_readlane_b32 s17, v42, 43
+; SI-NEXT:    s_and_b32 s18, s99, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 24
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_addk_i32 s16, 0x300
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    v_readlane_b32 s17, v42, 38
+; SI-NEXT:    s_add_i32 s87, s17, 3
+; SI-NEXT:    v_readlane_b32 s18, v42, 35
+; SI-NEXT:    v_readlane_b32 s19, v42, 29
+; SI-NEXT:    s_and_b32 s17, s87, 0xff
+; SI-NEXT:    s_lshl_b32 s18, s18, 8
+; SI-NEXT:    s_add_i32 s23, s19, 3
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    v_readlane_b32 s18, v42, 30
+; SI-NEXT:    s_and_b32 s23, s23, 0xff
+; SI-NEXT:    s_lshl_b32 s18, s18, 24
+; SI-NEXT:    s_lshl_b32 s23, s23, 16
+; SI-NEXT:    s_addk_i32 s17, 0x300
+; SI-NEXT:    s_or_b32 s18, s18, s23
+; SI-NEXT:    s_and_b32 s17, s17, 0xffff
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_add_i32 s40, s16, 0x3000000
+; SI-NEXT:    v_readlane_b32 s16, v42, 45
+; SI-NEXT:    s_add_i32 s41, s17, 0x3000000
+; SI-NEXT:    s_add_i32 s68, s16, 3
+; SI-NEXT:    v_readlane_b32 s17, v42, 44
+; SI-NEXT:    v_readlane_b32 s18, v42, 39
+; SI-NEXT:    s_and_b32 s16, s68, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 8
+; SI-NEXT:    s_add_i32 s96, s18, 3
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    v_readlane_b32 s17, v42, 40
+; SI-NEXT:    s_and_b32 s18, s96, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 24
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_addk_i32 s16, 0x300
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    v_readlane_b32 s17, v42, 33
+; SI-NEXT:    s_add_i32 s17, s17, 3
+; SI-NEXT:    v_readlane_b32 s18, v42, 31
+; SI-NEXT:    s_and_b32 s17, s17, 0xff
+; SI-NEXT:    s_lshl_b32 s18, s18, 8
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    v_readlane_b32 s18, v42, 24
+; SI-NEXT:    s_addk_i32 s17, 0x300
+; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_add_i32 s42, s16, 0x3000000
+; SI-NEXT:    s_and_b32 s16, s17, 0xffff
+; SI-NEXT:    v_readlane_b32 s17, v42, 25
+; SI-NEXT:    s_and_b32 s18, s18, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 24
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s43, s16, 0x3000000
+; SI-NEXT:    v_readlane_b32 s16, v42, 46
+; SI-NEXT:    s_add_i32 s23, s16, 3
+; SI-NEXT:    v_readlane_b32 s17, v42, 41
+; SI-NEXT:    v_readlane_b32 s18, v42, 37
+; SI-NEXT:    s_and_b32 s16, s23, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 8
+; SI-NEXT:    s_add_i32 s86, s18, 3
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    v_readlane_b32 s17, v42, 36
+; SI-NEXT:    s_and_b32 s18, s86, 0xff
+; SI-NEXT:    s_addk_i32 s16, 0x300
+; SI-NEXT:    s_lshl_b32 s17, s17, 24
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s44, s16, 0x3000000
+; SI-NEXT:    v_readlane_b32 s16, v42, 28
+; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    v_readlane_b32 s17, v42, 22
+; SI-NEXT:    v_readlane_b32 s18, v42, 16
+; SI-NEXT:    s_and_b32 s16, s16, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 8
+; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    v_readlane_b32 s17, v42, 15
+; SI-NEXT:    s_and_b32 s18, s18, 0xff
+; SI-NEXT:    s_addk_i32 s16, 0x300
+; SI-NEXT:    s_lshl_b32 s17, s17, 24
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s45, s16, 0x3000000
+; SI-NEXT:    v_readlane_b32 s16, v42, 34
+; SI-NEXT:    s_add_i32 s83, s16, 3
+; SI-NEXT:    v_readlane_b32 s17, v42, 32
+; SI-NEXT:    v_readlane_b32 s18, v42, 26
+; SI-NEXT:    s_and_b32 s16, s83, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 8
+; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    v_readlane_b32 s17, v42, 27
+; SI-NEXT:    s_and_b32 s18, s18, 0xff
+; SI-NEXT:    s_addk_i32 s16, 0x300
+; SI-NEXT:    s_lshl_b32 s17, s17, 24
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s46, s16, 0x3000000
+; SI-NEXT:    v_readlane_b32 s16, v42, 23
+; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    v_readlane_b32 s17, v42, 21
+; SI-NEXT:    v_readlane_b32 s18, v42, 17
+; SI-NEXT:    s_and_b32 s16, s16, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 8
+; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    v_readlane_b32 s17, v42, 18
+; SI-NEXT:    s_and_b32 s18, s18, 0xff
+; SI-NEXT:    s_addk_i32 s16, 0x300
+; SI-NEXT:    s_lshl_b32 s17, s17, 24
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s47, s16, 0x3000000
+; SI-NEXT:    v_readlane_b32 s16, v42, 20
+; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    v_readlane_b32 s17, v42, 19
+; SI-NEXT:    v_readlane_b32 s18, v42, 13
+; SI-NEXT:    s_and_b32 s16, s16, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 8
+; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    v_readlane_b32 s17, v42, 14
+; SI-NEXT:    s_and_b32 s18, s18, 0xff
+; SI-NEXT:    s_addk_i32 s16, 0x300
+; SI-NEXT:    s_lshl_b32 s17, s17, 24
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s56, s16, 0x3000000
+; SI-NEXT:    v_readlane_b32 s16, v42, 12
+; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    v_readlane_b32 s17, v42, 11
+; SI-NEXT:    v_readlane_b32 s18, v42, 7
+; SI-NEXT:    s_and_b32 s16, s16, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 8
+; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    v_readlane_b32 s17, v42, 8
+; SI-NEXT:    s_and_b32 s18, s18, 0xff
+; SI-NEXT:    s_addk_i32 s16, 0x300
+; SI-NEXT:    s_lshl_b32 s17, s17, 24
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s57, s16, 0x3000000
+; SI-NEXT:    v_readlane_b32 s16, v42, 10
+; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    v_readlane_b32 s17, v42, 9
+; SI-NEXT:    v_readlane_b32 s18, v42, 5
+; SI-NEXT:    s_and_b32 s16, s16, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 8
+; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    v_readlane_b32 s17, v42, 6
+; SI-NEXT:    s_and_b32 s18, s18, 0xff
+; SI-NEXT:    s_addk_i32 s16, 0x300
+; SI-NEXT:    s_lshl_b32 s17, s17, 24
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s58, s16, 0x3000000
+; SI-NEXT:    v_readlane_b32 s16, v42, 4
+; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    v_readlane_b32 s17, v42, 3
+; SI-NEXT:    v_readlane_b32 s18, v43, 63
+; SI-NEXT:    s_and_b32 s16, s16, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 8
+; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    v_readlane_b32 s17, v42, 0
+; SI-NEXT:    s_and_b32 s18, s18, 0xff
+; SI-NEXT:    s_addk_i32 s16, 0x300
+; SI-NEXT:    s_lshl_b32 s17, s17, 24
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s59, s16, 0x3000000
+; SI-NEXT:    v_readlane_b32 s16, v42, 2
+; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    v_readlane_b32 s17, v42, 1
+; SI-NEXT:    v_readlane_b32 s18, v43, 61
+; SI-NEXT:    s_and_b32 s16, s16, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 8
+; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    v_readlane_b32 s17, v43, 62
+; SI-NEXT:    s_and_b32 s18, s18, 0xff
+; SI-NEXT:    s_addk_i32 s16, 0x300
+; SI-NEXT:    s_lshl_b32 s17, s17, 24
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s60, s16, 0x3000000
+; SI-NEXT:    v_readlane_b32 s16, v43, 60
+; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    v_readlane_b32 s17, v43, 59
+; SI-NEXT:    v_readlane_b32 s18, v43, 55
+; SI-NEXT:    s_and_b32 s16, s16, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 8
+; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    v_readlane_b32 s17, v43, 56
+; SI-NEXT:    s_and_b32 s18, s18, 0xff
+; SI-NEXT:    s_addk_i32 s16, 0x300
+; SI-NEXT:    s_lshl_b32 s17, s17, 24
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s61, s16, 0x3000000
+; SI-NEXT:    v_readlane_b32 s16, v43, 58
+; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    v_readlane_b32 s17, v43, 57
+; SI-NEXT:    v_readlane_b32 s18, v43, 53
+; SI-NEXT:    s_and_b32 s16, s16, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 8
+; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    v_readlane_b32 s17, v43, 54
+; SI-NEXT:    s_and_b32 s18, s18, 0xff
+; SI-NEXT:    s_addk_i32 s16, 0x300
+; SI-NEXT:    s_lshl_b32 s17, s17, 24
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s62, s16, 0x3000000
+; SI-NEXT:    v_readlane_b32 s16, v43, 52
+; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    v_readlane_b32 s17, v43, 51
+; SI-NEXT:    v_readlane_b32 s18, v43, 49
+; SI-NEXT:    s_and_b32 s16, s16, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 8
+; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    v_readlane_b32 s17, v43, 50
+; SI-NEXT:    s_and_b32 s18, s18, 0xff
+; SI-NEXT:    s_addk_i32 s16, 0x300
+; SI-NEXT:    s_lshl_b32 s17, s17, 24
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s63, s16, 0x3000000
+; SI-NEXT:    v_readlane_b32 s16, v43, 5
+; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    v_readlane_b32 s17, v43, 4
+; SI-NEXT:    v_readlane_b32 s18, v43, 3
+; SI-NEXT:    s_and_b32 s16, s16, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 8
+; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    v_readlane_b32 s17, v43, 2
+; SI-NEXT:    s_and_b32 s18, s18, 0xff
+; SI-NEXT:    s_addk_i32 s16, 0x300
+; SI-NEXT:    s_lshl_b32 s17, s17, 24
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s72, s16, 0x3000000
+; SI-NEXT:    v_readlane_b32 s16, v43, 1
+; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    v_readlane_b32 s17, v43, 0
+; SI-NEXT:    v_readlane_b32 s18, v43, 47
+; SI-NEXT:    s_and_b32 s16, s16, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 8
+; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    v_readlane_b32 s17, v43, 48
+; SI-NEXT:    s_and_b32 s18, s18, 0xff
+; SI-NEXT:    s_addk_i32 s16, 0x300
+; SI-NEXT:    s_lshl_b32 s17, s17, 24
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s73, s16, 0x3000000
+; SI-NEXT:    v_readlane_b32 s16, v43, 13
+; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    v_readlane_b32 s17, v43, 12
+; SI-NEXT:    v_readlane_b32 s18, v43, 11
+; SI-NEXT:    s_and_b32 s16, s16, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 8
+; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    v_readlane_b32 s17, v43, 10
+; SI-NEXT:    s_and_b32 s18, s18, 0xff
+; SI-NEXT:    s_addk_i32 s16, 0x300
+; SI-NEXT:    s_lshl_b32 s17, s17, 24
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s74, s16, 0x3000000
+; SI-NEXT:    v_readlane_b32 s16, v43, 9
+; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    v_readlane_b32 s17, v43, 8
+; SI-NEXT:    v_readlane_b32 s18, v43, 7
+; SI-NEXT:    s_and_b32 s16, s16, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 8
+; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    v_readlane_b32 s17, v43, 6
+; SI-NEXT:    s_and_b32 s18, s18, 0xff
+; SI-NEXT:    s_addk_i32 s16, 0x300
+; SI-NEXT:    s_lshl_b32 s17, s17, 24
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s4, s4, 0x3000000
+; SI-NEXT:    s_add_i32 s5, s5, 0x3000000
+; SI-NEXT:    s_add_i32 s6, s6, 0x3000000
+; SI-NEXT:    s_add_i32 s7, s7, 0x3000000
+; SI-NEXT:    s_add_i32 s8, s8, 0x3000000
+; SI-NEXT:    s_add_i32 s9, s9, 0x3000000
+; SI-NEXT:    s_add_i32 s10, s10, 0x3000000
+; SI-NEXT:    s_add_i32 s11, s11, 0x3000000
+; SI-NEXT:    s_add_i32 s12, s12, 0x3000000
+; SI-NEXT:    s_add_i32 s13, s13, 0x3000000
+; SI-NEXT:    s_add_i32 s14, s14, 0x3000000
+; SI-NEXT:    s_add_i32 s15, s15, 0x3000000
+; SI-NEXT:    s_add_i32 s75, s16, 0x3000000
+; SI-NEXT:    s_lshr_b64 s[76:77], s[74:75], 16
+; SI-NEXT:    s_lshr_b64 s[38:39], s[40:41], 16
+; SI-NEXT:    s_lshr_b64 s[48:49], s[14:15], 16
+; SI-NEXT:    s_lshr_b64 s[50:51], s[12:13], 16
+; SI-NEXT:    s_lshr_b64 s[52:53], s[10:11], 16
+; SI-NEXT:    s_lshr_b64 s[54:55], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[64:65], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[66:67], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[26:27], s[72:73], 16
+; SI-NEXT:    s_lshr_b64 s[28:29], s[62:63], 16
+; SI-NEXT:    s_lshr_b64 s[88:89], s[60:61], 16
+; SI-NEXT:    s_lshr_b64 s[20:21], s[58:59], 16
+; SI-NEXT:    s_lshr_b64 s[24:25], s[56:57], 16
+; SI-NEXT:    s_lshr_b64 s[34:35], s[46:47], 16
+; SI-NEXT:    s_lshr_b64 s[94:95], s[44:45], 16
+; SI-NEXT:    s_lshr_b64 s[84:85], s[42:43], 16
+; SI-NEXT:    s_lshr_b32 s55, s75, 16
+; SI-NEXT:    s_lshr_b32 s53, s73, 16
+; SI-NEXT:    s_lshr_b32 s51, s63, 16
+; SI-NEXT:    s_lshr_b32 s49, s61, 16
+; SI-NEXT:    s_lshr_b32 s86, s59, 16
+; SI-NEXT:    s_lshr_b32 s39, s57, 16
+; SI-NEXT:    s_lshr_b32 s18, s47, 16
+; SI-NEXT:    s_lshr_b32 s22, s45, 16
+; SI-NEXT:    s_lshr_b32 s97, s43, 16
+; SI-NEXT:    s_lshr_b32 s65, s41, 16
+; SI-NEXT:    s_lshr_b32 s19, s15, 16
+; SI-NEXT:    s_lshr_b32 s71, s13, 16
+; SI-NEXT:    s_lshr_b32 s67, s11, 16
+; SI-NEXT:    s_lshr_b32 s80, s9, 16
+; SI-NEXT:    s_lshr_b32 s81, s7, 16
+; SI-NEXT:    s_lshr_b32 s77, s5, 16
 ; SI-NEXT:  .LBB97_3: ; %end
-; SI-NEXT:    s_and_b32 s4, s18, 0xffff
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v57
-; SI-NEXT:    v_or_b32_e32 v5, s4, v5
-; SI-NEXT:    s_and_b32 s4, s42, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s73, 16
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen
+; SI-NEXT:    s_and_b32 s16, s74, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s76, 16
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v1, s16
+; SI-NEXT:    s_and_b32 s16, s75, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s55, 16
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s72, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s26, 16
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 4, v0
-; SI-NEXT:    v_mov_b32_e32 v6, s4
-; SI-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
-; SI-NEXT:    s_and_b32 s4, s17, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v58
-; SI-NEXT:    v_or_b32_e32 v5, s4, v5
-; SI-NEXT:    s_and_b32 s4, s40, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s72, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 8, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s73, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s53, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 12, v0
-; SI-NEXT:    v_mov_b32_e32 v6, s4
-; SI-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v47
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s62, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s28, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 12, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v56
-; SI-NEXT:    s_and_b32 s4, s15, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s63, 16
-; SI-NEXT:    v_or_b32_e32 v5, v5, v6
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 16, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s63, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s51, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 16, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 20, v0
-; SI-NEXT:    v_mov_b32_e32 v6, s4
-; SI-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v45
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s60, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s88, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 20, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v51
-; SI-NEXT:    s_and_b32 s4, s14, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s62, 16
-; SI-NEXT:    v_or_b32_e32 v5, v5, v6
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 24, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s61, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s49, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 24, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 28, v0
-; SI-NEXT:    v_mov_b32_e32 v6, s4
-; SI-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v17
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s58, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s20, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 28, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
-; SI-NEXT:    s_and_b32 s4, s13, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s61, 16
-; SI-NEXT:    v_or_b32_e32 v5, v5, v6
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 32, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s59, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s86, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 32, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 36, v0
-; SI-NEXT:    v_mov_b32_e32 v6, s4
-; SI-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v16
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s56, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s24, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 36, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
-; SI-NEXT:    s_and_b32 s4, s12, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s60, 16
-; SI-NEXT:    v_or_b32_e32 v5, v5, v6
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 40, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s57, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s39, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 40, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 44, v0
-; SI-NEXT:    v_mov_b32_e32 v6, s4
-; SI-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v15
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s46, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s34, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 44, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v44
-; SI-NEXT:    s_and_b32 s4, s10, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s59, 16
-; SI-NEXT:    v_or_b32_e32 v5, v5, v6
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s47, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s18, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 48, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 52, v0
-; SI-NEXT:    v_mov_b32_e32 v6, s4
-; SI-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v12
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s44, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s94, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 52, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v43
-; SI-NEXT:    s_and_b32 s4, s8, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s58, 16
-; SI-NEXT:    v_or_b32_e32 v5, v5, v6
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 56, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s45, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s22, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 56, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 60, v0
-; SI-NEXT:    v_mov_b32_e32 v6, s4
-; SI-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v10
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s42, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s84, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 60, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v42
-; SI-NEXT:    s_and_b32 s4, s36, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s57, 16
-; SI-NEXT:    v_or_b32_e32 v5, v5, v6
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 64, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s43, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s97, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 64, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x44, v0
-; SI-NEXT:    v_mov_b32_e32 v6, s4
-; SI-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v9
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s40, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s38, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x44, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v41
-; SI-NEXT:    s_and_b32 s4, s53, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s56, 16
-; SI-NEXT:    v_or_b32_e32 v5, v5, v6
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s41, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s65, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x48, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x4c, v0
-; SI-NEXT:    v_mov_b32_e32 v6, s4
-; SI-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v8
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s14, s14, 0xffff
+; SI-NEXT:    s_lshl_b32 s16, s48, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x4c, v0
+; SI-NEXT:    s_or_b32 s14, s14, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v40
-; SI-NEXT:    s_and_b32 s4, s94, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s47, 16
-; SI-NEXT:    v_or_b32_e32 v5, v5, v6
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x50, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v2, s14
+; SI-NEXT:    s_and_b32 s14, s15, 0xffff
+; SI-NEXT:    s_lshl_b32 s15, s19, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x50, v0
+; SI-NEXT:    s_or_b32 s14, s14, s15
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x54, v0
-; SI-NEXT:    v_mov_b32_e32 v6, s4
-; SI-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v7
+; SI-NEXT:    v_mov_b32_e32 v2, s14
+; SI-NEXT:    s_and_b32 s12, s12, 0xffff
+; SI-NEXT:    s_lshl_b32 s14, s50, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x54, v0
+; SI-NEXT:    s_or_b32 s12, s12, s14
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v55
-; SI-NEXT:    s_and_b32 s4, s49, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s46, 16
-; SI-NEXT:    v_or_b32_e32 v5, v5, v6
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x58, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v2, s12
+; SI-NEXT:    s_and_b32 s12, s13, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s71, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x58, v0
+; SI-NEXT:    s_or_b32 s12, s12, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x5c, v0
-; SI-NEXT:    v_mov_b32_e32 v6, s4
-; SI-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v54
-; SI-NEXT:    s_and_b32 s4, s48, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s45, 16
-; SI-NEXT:    v_or_b32_e32 v4, v4, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x60, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v2, s12
+; SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; SI-NEXT:    s_lshl_b32 s12, s52, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x5c, v0
+; SI-NEXT:    s_or_b32 s10, s10, s12
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x64, v0
-; SI-NEXT:    v_mov_b32_e32 v5, s4
-; SI-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v53
-; SI-NEXT:    s_and_b32 s4, s11, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s44, 16
-; SI-NEXT:    v_or_b32_e32 v2, v2, v4
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x68, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s10, s11, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s67, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x60, v0
+; SI-NEXT:    s_or_b32 s10, s10, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x6c, v0
-; SI-NEXT:    v_mov_b32_e32 v4, s4
-; SI-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v31
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v52
-; SI-NEXT:    s_and_b32 s4, s9, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s43, 16
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x70, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; SI-NEXT:    s_lshl_b32 s10, s54, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x64, v0
+; SI-NEXT:    s_or_b32 s8, s8, s10
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s8, s9, 0xffff
+; SI-NEXT:    s_lshl_b32 s9, s80, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x68, v0
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_lshl_b32 s8, s64, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x6c, v0
+; SI-NEXT:    s_or_b32 s6, s6, s8
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s6, s7, 0xffff
+; SI-NEXT:    s_lshl_b32 s7, s81, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x70, v0
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; SI-NEXT:    s_lshl_b32 s6, s66, 16
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x74, v0
-; SI-NEXT:    v_mov_b32_e32 v2, s4
+; SI-NEXT:    s_or_b32 s4, s4, s6
 ; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v49
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v28
-; SI-NEXT:    s_and_b32 s4, s16, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s41, 16
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x78, v0
+; SI-NEXT:    v_mov_b32_e32 v2, s4
+; SI-NEXT:    s_and_b32 s4, s5, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s77, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x78, v0
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
-; SI-NEXT:    v_readlane_b32 s99, v63, 35
-; SI-NEXT:    v_readlane_b32 s98, v63, 34
-; SI-NEXT:    v_readlane_b32 s97, v63, 33
-; SI-NEXT:    v_readlane_b32 s96, v63, 32
-; SI-NEXT:    v_readlane_b32 s87, v63, 31
-; SI-NEXT:    v_readlane_b32 s86, v63, 30
-; SI-NEXT:    v_readlane_b32 s85, v63, 29
-; SI-NEXT:    v_readlane_b32 s84, v63, 28
-; SI-NEXT:    v_readlane_b32 s83, v63, 27
-; SI-NEXT:    v_readlane_b32 s82, v63, 26
-; SI-NEXT:    v_readlane_b32 s81, v63, 25
-; SI-NEXT:    v_readlane_b32 s80, v63, 24
-; SI-NEXT:    v_readlane_b32 s71, v63, 23
-; SI-NEXT:    v_readlane_b32 s70, v63, 22
-; SI-NEXT:    v_readlane_b32 s69, v63, 21
-; SI-NEXT:    v_readlane_b32 s68, v63, 20
-; SI-NEXT:    v_readlane_b32 s67, v63, 19
-; SI-NEXT:    v_readlane_b32 s66, v63, 18
-; SI-NEXT:    v_readlane_b32 s65, v63, 17
-; SI-NEXT:    v_readlane_b32 s64, v63, 16
-; SI-NEXT:    v_readlane_b32 s55, v63, 15
-; SI-NEXT:    v_readlane_b32 s54, v63, 14
-; SI-NEXT:    v_readlane_b32 s53, v63, 13
-; SI-NEXT:    v_readlane_b32 s52, v63, 12
-; SI-NEXT:    v_readlane_b32 s51, v63, 11
-; SI-NEXT:    v_readlane_b32 s50, v63, 10
-; SI-NEXT:    v_readlane_b32 s49, v63, 9
-; SI-NEXT:    v_readlane_b32 s48, v63, 8
-; SI-NEXT:    v_readlane_b32 s39, v63, 7
-; SI-NEXT:    v_readlane_b32 s38, v63, 6
-; SI-NEXT:    v_readlane_b32 s37, v63, 5
-; SI-NEXT:    v_readlane_b32 s36, v63, 4
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT:    v_readlane_b32 s99, v41, 35
+; SI-NEXT:    v_readlane_b32 s98, v41, 34
+; SI-NEXT:    v_readlane_b32 s97, v41, 33
+; SI-NEXT:    v_readlane_b32 s96, v41, 32
+; SI-NEXT:    v_readlane_b32 s87, v41, 31
+; SI-NEXT:    v_readlane_b32 s86, v41, 30
+; SI-NEXT:    v_readlane_b32 s85, v41, 29
+; SI-NEXT:    v_readlane_b32 s84, v41, 28
+; SI-NEXT:    v_readlane_b32 s83, v41, 27
+; SI-NEXT:    v_readlane_b32 s82, v41, 26
+; SI-NEXT:    v_readlane_b32 s81, v41, 25
+; SI-NEXT:    v_readlane_b32 s80, v41, 24
+; SI-NEXT:    v_readlane_b32 s71, v41, 23
+; SI-NEXT:    v_readlane_b32 s70, v41, 22
+; SI-NEXT:    v_readlane_b32 s69, v41, 21
+; SI-NEXT:    v_readlane_b32 s68, v41, 20
+; SI-NEXT:    v_readlane_b32 s67, v41, 19
+; SI-NEXT:    v_readlane_b32 s66, v41, 18
+; SI-NEXT:    v_readlane_b32 s65, v41, 17
+; SI-NEXT:    v_readlane_b32 s64, v41, 16
+; SI-NEXT:    v_readlane_b32 s55, v41, 15
+; SI-NEXT:    v_readlane_b32 s54, v41, 14
+; SI-NEXT:    v_readlane_b32 s53, v41, 13
+; SI-NEXT:    v_readlane_b32 s52, v41, 12
+; SI-NEXT:    v_readlane_b32 s51, v41, 11
+; SI-NEXT:    v_readlane_b32 s50, v41, 10
+; SI-NEXT:    v_readlane_b32 s49, v41, 9
+; SI-NEXT:    v_readlane_b32 s48, v41, 8
+; SI-NEXT:    v_readlane_b32 s39, v41, 7
+; SI-NEXT:    v_readlane_b32 s38, v41, 6
+; SI-NEXT:    v_readlane_b32 s37, v41, 5
+; SI-NEXT:    v_readlane_b32 s36, v41, 4
+; SI-NEXT:    v_readlane_b32 s35, v41, 3
+; SI-NEXT:    v_readlane_b32 s34, v41, 2
+; SI-NEXT:    v_readlane_b32 s31, v41, 1
+; SI-NEXT:    v_readlane_b32 s30, v41, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB97_4:
-; SI-NEXT:    ; implicit-def: $sgpr18
-; SI-NEXT:    ; implicit-def: $vgpr57
-; SI-NEXT:    ; implicit-def: $sgpr42
-; SI-NEXT:    ; implicit-def: $sgpr73
-; SI-NEXT:    ; implicit-def: $sgpr17
-; SI-NEXT:    ; implicit-def: $vgpr58
-; SI-NEXT:    ; implicit-def: $sgpr40
+; SI-NEXT:    ; implicit-def: $sgpr74
+; SI-NEXT:    ; implicit-def: $sgpr76
+; SI-NEXT:    ; implicit-def: $sgpr55
 ; SI-NEXT:    ; implicit-def: $sgpr72
-; SI-NEXT:    ; implicit-def: $vgpr47
-; SI-NEXT:    ; implicit-def: $vgpr56
-; SI-NEXT:    ; implicit-def: $sgpr15
-; SI-NEXT:    ; implicit-def: $sgpr63
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr51
-; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr26
+; SI-NEXT:    ; implicit-def: $sgpr53
 ; SI-NEXT:    ; implicit-def: $sgpr62
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr13
-; SI-NEXT:    ; implicit-def: $sgpr61
-; SI-NEXT:    ; implicit-def: $vgpr16
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr28
+; SI-NEXT:    ; implicit-def: $sgpr51
 ; SI-NEXT:    ; implicit-def: $sgpr60
-; SI-NEXT:    ; implicit-def: $vgpr15
-; SI-NEXT:    ; implicit-def: $vgpr44
-; SI-NEXT:    ; implicit-def: $sgpr10
-; SI-NEXT:    ; implicit-def: $sgpr59
-; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; implicit-def: $vgpr43
-; SI-NEXT:    ; implicit-def: $sgpr8
+; SI-NEXT:    ; implicit-def: $sgpr88
+; SI-NEXT:    ; implicit-def: $sgpr49
 ; SI-NEXT:    ; implicit-def: $sgpr58
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $vgpr42
-; SI-NEXT:    ; implicit-def: $sgpr36
-; SI-NEXT:    ; implicit-def: $sgpr57
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr41
-; SI-NEXT:    ; implicit-def: $sgpr53
+; SI-NEXT:    ; implicit-def: $sgpr20
+; SI-NEXT:    ; implicit-def: $sgpr86
 ; SI-NEXT:    ; implicit-def: $sgpr56
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr40
-; SI-NEXT:    ; implicit-def: $sgpr94
-; SI-NEXT:    ; implicit-def: $sgpr47
-; SI-NEXT:    ; implicit-def: $vgpr7
-; SI-NEXT:    ; implicit-def: $vgpr55
-; SI-NEXT:    ; implicit-def: $sgpr49
+; SI-NEXT:    ; implicit-def: $sgpr24
+; SI-NEXT:    ; implicit-def: $sgpr39
 ; SI-NEXT:    ; implicit-def: $sgpr46
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $vgpr54
-; SI-NEXT:    ; implicit-def: $sgpr48
-; SI-NEXT:    ; implicit-def: $sgpr45
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr53
-; SI-NEXT:    ; implicit-def: $sgpr11
+; SI-NEXT:    ; implicit-def: $sgpr34
+; SI-NEXT:    ; implicit-def: $sgpr18
 ; SI-NEXT:    ; implicit-def: $sgpr44
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr52
-; SI-NEXT:    ; implicit-def: $sgpr9
-; SI-NEXT:    ; implicit-def: $sgpr43
-; SI-NEXT:    ; implicit-def: $vgpr49
-; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; implicit-def: $sgpr16
-; SI-NEXT:    ; implicit-def: $sgpr41
+; SI-NEXT:    ; implicit-def: $sgpr94
+; SI-NEXT:    ; implicit-def: $sgpr22
+; SI-NEXT:    ; implicit-def: $sgpr42
+; SI-NEXT:    ; implicit-def: $sgpr84
+; SI-NEXT:    ; implicit-def: $sgpr97
+; SI-NEXT:    ; implicit-def: $sgpr40
+; SI-NEXT:    ; implicit-def: $sgpr38
+; SI-NEXT:    ; implicit-def: $sgpr65
+; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr19
+; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr50
+; SI-NEXT:    ; implicit-def: $sgpr71
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr52
+; SI-NEXT:    ; implicit-def: $sgpr67
+; SI-NEXT:    ; implicit-def: $sgpr8
+; SI-NEXT:    ; implicit-def: $sgpr54
+; SI-NEXT:    ; implicit-def: $sgpr80
+; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr64
+; SI-NEXT:    ; implicit-def: $sgpr81
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr66
+; SI-NEXT:    ; implicit-def: $sgpr77
 ; SI-NEXT:    s_branch .LBB97_2
 ;
 ; VI-LABEL: bitcast_v128i8_to_v64i16_scalar:
@@ -208926,1387 +210637,1672 @@ define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i3
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
-; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:76
-; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:72
-; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:60
-; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:56
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:44
-; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:28
-; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:24
-; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:12
-; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:80
-; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:68
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:64
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:52
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:4
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:20
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:36
-; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; SI-NEXT:    v_writelane_b32 v63, s30, 0
-; SI-NEXT:    v_writelane_b32 v63, s31, 1
-; SI-NEXT:    v_writelane_b32 v63, s34, 2
-; SI-NEXT:    v_writelane_b32 v63, s35, 3
-; SI-NEXT:    v_writelane_b32 v63, s36, 4
-; SI-NEXT:    v_writelane_b32 v63, s37, 5
-; SI-NEXT:    v_writelane_b32 v63, s38, 6
-; SI-NEXT:    v_writelane_b32 v63, s39, 7
-; SI-NEXT:    v_writelane_b32 v63, s48, 8
-; SI-NEXT:    v_writelane_b32 v63, s49, 9
-; SI-NEXT:    v_writelane_b32 v63, s50, 10
-; SI-NEXT:    v_writelane_b32 v63, s51, 11
-; SI-NEXT:    v_writelane_b32 v63, s52, 12
-; SI-NEXT:    v_writelane_b32 v63, s53, 13
-; SI-NEXT:    v_writelane_b32 v63, s54, 14
-; SI-NEXT:    v_writelane_b32 v63, s55, 15
-; SI-NEXT:    v_writelane_b32 v63, s64, 16
-; SI-NEXT:    v_writelane_b32 v63, s65, 17
-; SI-NEXT:    v_writelane_b32 v63, s66, 18
-; SI-NEXT:    v_writelane_b32 v63, s67, 19
-; SI-NEXT:    v_writelane_b32 v63, s68, 20
-; SI-NEXT:    v_writelane_b32 v63, s69, 21
-; SI-NEXT:    v_writelane_b32 v63, s70, 22
-; SI-NEXT:    v_writelane_b32 v63, s71, 23
-; SI-NEXT:    v_writelane_b32 v63, s80, 24
-; SI-NEXT:    v_writelane_b32 v63, s81, 25
-; SI-NEXT:    v_writelane_b32 v63, s82, 26
-; SI-NEXT:    v_writelane_b32 v63, s83, 27
-; SI-NEXT:    v_writelane_b32 v63, s84, 28
-; SI-NEXT:    v_writelane_b32 v63, s85, 29
-; SI-NEXT:    v_writelane_b32 v63, s86, 30
-; SI-NEXT:    v_writelane_b32 v63, s87, 31
-; SI-NEXT:    v_writelane_b32 v63, s96, 32
-; SI-NEXT:    v_writelane_b32 v63, s97, 33
-; SI-NEXT:    v_writelane_b32 v63, s98, 34
-; SI-NEXT:    s_mov_b32 s6, s18
-; SI-NEXT:    ; implicit-def: $vgpr62 : SGPR spill to VGPR lane
-; SI-NEXT:    v_writelane_b32 v63, s99, 35
-; SI-NEXT:    v_readfirstlane_b32 s62, v30
-; SI-NEXT:    v_readfirstlane_b32 s63, v29
-; SI-NEXT:    v_readfirstlane_b32 s59, v26
-; SI-NEXT:    v_readfirstlane_b32 s60, v25
-; SI-NEXT:    v_readfirstlane_b32 s98, v22
-; SI-NEXT:    v_readfirstlane_b32 s61, v21
-; SI-NEXT:    v_readfirstlane_b32 s99, v18
-; SI-NEXT:    v_readfirstlane_b32 s58, v17
-; SI-NEXT:    v_readfirstlane_b32 s96, v14
-; SI-NEXT:    v_readfirstlane_b32 s97, v13
-; SI-NEXT:    v_readfirstlane_b32 s86, v10
-; SI-NEXT:    v_readfirstlane_b32 s87, v9
-; SI-NEXT:    v_readfirstlane_b32 s84, v6
-; SI-NEXT:    v_readfirstlane_b32 s85, v5
-; SI-NEXT:    v_readfirstlane_b32 s81, v2
-; SI-NEXT:    v_readfirstlane_b32 s82, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v16
-; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
-; SI-NEXT:    v_writelane_b32 v62, s6, 0
-; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_readfirstlane_b32 s88, v32
-; SI-NEXT:    v_readfirstlane_b32 s18, v34
-; SI-NEXT:    v_readfirstlane_b32 s78, v38
-; SI-NEXT:    v_readfirstlane_b32 s79, v39
-; SI-NEXT:    v_readfirstlane_b32 s76, v48
-; SI-NEXT:    v_readfirstlane_b32 s77, v49
-; SI-NEXT:    v_readfirstlane_b32 s74, v50
-; SI-NEXT:    s_waitcnt vmcnt(13)
-; SI-NEXT:    v_readfirstlane_b32 s75, v51
-; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_readfirstlane_b32 s72, v52
-; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_readfirstlane_b32 s73, v53
-; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v54
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v24
-; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_lshlrev_b32_e32 v49, 16, v40
-; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_lshlrev_b32_e32 v38, 16, v41
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v39, 16, v42
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v43
-; SI-NEXT:    v_lshlrev_b32_e32 v50, 16, v55
-; SI-NEXT:    s_cbranch_scc0 .LBB99_4
-; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    s_and_b32 s4, s16, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s17, 16
-; SI-NEXT:    s_or_b32 s56, s4, s5
-; SI-NEXT:    s_and_b32 s4, s6, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s19, 16
-; SI-NEXT:    s_or_b32 s57, s4, s5
-; SI-NEXT:    v_mov_b32_e32 v1, s56
-; SI-NEXT:    v_alignbit_b32 v2, s57, v1, 24
-; SI-NEXT:    s_and_b32 s4, s20, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s21, 16
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, s57, v1, 16
-; SI-NEXT:    v_alignbit_b32 v1, s57, v1, 8
-; SI-NEXT:    s_or_b32 s46, s4, s5
-; SI-NEXT:    s_and_b32 s4, s22, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s23, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT:    s_or_b32 s47, s4, s5
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v1, s46
-; SI-NEXT:    s_and_b32 s4, s24, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s25, 16
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, s47, v1, 24
-; SI-NEXT:    s_or_b32 s44, s4, s5
-; SI-NEXT:    s_and_b32 s4, s26, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s27, 16
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, s47, v1, 16
-; SI-NEXT:    v_alignbit_b32 v51, s47, v1, 8
-; SI-NEXT:    s_or_b32 s45, s4, s5
-; SI-NEXT:    v_mov_b32_e32 v1, s44
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, s45, v1, 24
-; SI-NEXT:    s_and_b32 s4, s28, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s29, 16
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, s45, v1, 16
-; SI-NEXT:    v_alignbit_b32 v1, s45, v1, 8
-; SI-NEXT:    s_or_b32 s42, s4, s5
-; SI-NEXT:    s_and_b32 s4, s82, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s81, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT:    s_or_b32 s43, s4, s5
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v1, s42
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, s43, v1, 24
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, s43, v1, 16
-; SI-NEXT:    v_alignbit_b32 v1, s43, v1, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v3
-; SI-NEXT:    s_and_b32 s4, s85, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s84, 16
-; SI-NEXT:    v_or_b32_e32 v16, v1, v25
-; SI-NEXT:    s_or_b32 s41, s4, s5
-; SI-NEXT:    v_alignbit_b32 v1, s41, v16, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, s41, v16, 16
-; SI-NEXT:    s_and_b32 s4, s87, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s86, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, s41, v16, 8
-; SI-NEXT:    s_or_b32 s40, s4, s5
-; SI-NEXT:    s_and_b32 s4, s97, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s96, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v7
-; SI-NEXT:    s_or_b32 s15, s4, s5
-; SI-NEXT:    s_and_b32 s4, s58, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s99, 16
-; SI-NEXT:    v_or_b32_e32 v14, v1, v26
-; SI-NEXT:    s_or_b32 s14, s4, s5
-; SI-NEXT:    s_and_b32 s4, s61, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s98, 16
-; SI-NEXT:    v_alignbit_b32 v1, s40, v14, 24
-; SI-NEXT:    s_or_b32 s13, s4, s5
-; SI-NEXT:    s_and_b32 s4, s60, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s59, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, s40, v14, 16
-; SI-NEXT:    s_or_b32 s12, s4, s5
-; SI-NEXT:    s_and_b32 s4, s63, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s62, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, s40, v14, 8
-; SI-NEXT:    s_or_b32 s11, s4, s5
-; SI-NEXT:    s_and_b32 s4, s73, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s72, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v11
-; SI-NEXT:    s_or_b32 s10, s4, s5
-; SI-NEXT:    s_and_b32 s4, s75, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s74, 16
-; SI-NEXT:    v_or_b32_e32 v12, v1, v29
-; SI-NEXT:    s_or_b32 s9, s4, s5
-; SI-NEXT:    s_and_b32 s4, s77, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s76, 16
-; SI-NEXT:    v_alignbit_b32 v1, s15, v12, 24
-; SI-NEXT:    s_or_b32 s8, s4, s5
-; SI-NEXT:    s_and_b32 s4, s79, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s78, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, s15, v12, 16
-; SI-NEXT:    s_or_b32 s7, s4, s5
-; SI-NEXT:    s_and_b32 s4, s18, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s88, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, s15, v12, 8
-; SI-NEXT:    s_or_b32 s6, s4, s5
-; SI-NEXT:    s_lshr_b32 s4, s11, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v15
-; SI-NEXT:    v_writelane_b32 v62, s4, 1
-; SI-NEXT:    s_lshr_b32 s4, s10, 8
-; SI-NEXT:    v_or_b32_e32 v10, v1, v30
-; SI-NEXT:    v_writelane_b32 v62, s4, 3
-; SI-NEXT:    s_lshr_b32 s4, s9, 8
-; SI-NEXT:    v_alignbit_b32 v1, s14, v10, 24
-; SI-NEXT:    v_writelane_b32 v62, s4, 6
-; SI-NEXT:    s_lshr_b32 s4, s8, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, s14, v10, 16
-; SI-NEXT:    v_writelane_b32 v62, s4, 9
-; SI-NEXT:    s_lshr_b32 s4, s7, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, s14, v10, 8
-; SI-NEXT:    v_writelane_b32 v62, s4, 12
-; SI-NEXT:    s_lshr_b32 s4, s6, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v19
-; SI-NEXT:    v_writelane_b32 v62, s4, 15
-; SI-NEXT:    s_and_b32 s4, s72, 0xffff
-; SI-NEXT:    v_or_b32_e32 v8, v1, v32
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v23
-; SI-NEXT:    v_writelane_b32 v62, s4, 2
-; SI-NEXT:    s_and_b32 s4, s74, 0xffff
-; SI-NEXT:    v_or_b32_e32 v5, v1, v34
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v27
-; SI-NEXT:    v_writelane_b32 v62, s4, 5
-; SI-NEXT:    s_and_b32 s4, s76, 0xffff
-; SI-NEXT:    v_or_b32_e32 v13, v1, v28
-; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v31
-; SI-NEXT:    v_writelane_b32 v62, s4, 8
-; SI-NEXT:    s_and_b32 s4, s78, 0xffff
-; SI-NEXT:    v_or_b32_e32 v9, v1, v38
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v33
-; SI-NEXT:    v_writelane_b32 v62, s4, 11
-; SI-NEXT:    s_and_b32 s4, s88, 0xffff
-; SI-NEXT:    v_or_b32_e32 v6, v1, v39
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v35
-; SI-NEXT:    v_writelane_b32 v62, s4, 14
-; SI-NEXT:    s_bfe_u32 s4, s74, 0x80008
-; SI-NEXT:    v_or_b32_e32 v4, v1, v48
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v36
-; SI-NEXT:    v_writelane_b32 v62, s4, 4
-; SI-NEXT:    s_bfe_u32 s4, s76, 0x80008
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_or_b32_e32 v2, v1, v49
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v37
-; SI-NEXT:    v_writelane_b32 v62, s4, 7
-; SI-NEXT:    s_bfe_u32 s4, s78, 0x80008
-; SI-NEXT:    v_or_b32_e32 v1, v1, v50
-; SI-NEXT:    v_writelane_b32 v62, s4, 10
-; SI-NEXT:    s_bfe_u32 s4, s88, 0x80008
-; SI-NEXT:    s_lshr_b32 s68, s57, 8
-; SI-NEXT:    s_lshr_b32 s65, s47, 8
-; SI-NEXT:    s_lshr_b32 s54, s45, 8
-; SI-NEXT:    s_lshr_b32 s51, s43, 8
-; SI-NEXT:    s_lshr_b32 s48, s41, 8
-; SI-NEXT:    s_lshr_b32 s37, s40, 8
-; SI-NEXT:    s_lshr_b32 s34, s15, 8
-; SI-NEXT:    s_lshr_b32 s95, s14, 8
-; SI-NEXT:    s_lshr_b32 s92, s13, 8
-; SI-NEXT:    s_lshr_b32 s89, s12, 8
-; SI-NEXT:    s_and_b32 s71, s19, 0xffff
-; SI-NEXT:    s_and_b32 s69, s23, 0xffff
-; SI-NEXT:    s_and_b32 s66, s27, 0xffff
-; SI-NEXT:    s_and_b32 s55, s81, 0xffff
-; SI-NEXT:    s_and_b32 s52, s84, 0xffff
-; SI-NEXT:    s_and_b32 s49, s86, 0xffff
-; SI-NEXT:    s_and_b32 s38, s96, 0xffff
-; SI-NEXT:    s_and_b32 s35, s99, 0xffff
-; SI-NEXT:    s_and_b32 s30, s98, 0xffff
-; SI-NEXT:    s_and_b32 s93, s59, 0xffff
-; SI-NEXT:    s_and_b32 s90, s62, 0xffff
-; SI-NEXT:    s_bfe_u32 s83, s19, 0x80008
-; SI-NEXT:    s_bfe_u32 s80, s23, 0x80008
-; SI-NEXT:    s_bfe_u32 s70, s27, 0x80008
-; SI-NEXT:    s_bfe_u32 s67, s81, 0x80008
-; SI-NEXT:    s_bfe_u32 s64, s84, 0x80008
-; SI-NEXT:    s_bfe_u32 s53, s86, 0x80008
-; SI-NEXT:    s_bfe_u32 s50, s96, 0x80008
-; SI-NEXT:    s_bfe_u32 s39, s99, 0x80008
-; SI-NEXT:    s_bfe_u32 s36, s98, 0x80008
-; SI-NEXT:    s_bfe_u32 s31, s59, 0x80008
-; SI-NEXT:    s_bfe_u32 s94, s62, 0x80008
-; SI-NEXT:    s_bfe_u32 s91, s72, 0x80008
-; SI-NEXT:    v_writelane_b32 v62, s4, 13
-; SI-NEXT:    v_alignbit_b32 v45, s13, v8, 24
-; SI-NEXT:    v_alignbit_b32 v47, s13, v8, 16
-; SI-NEXT:    v_alignbit_b32 v57, s13, v8, 8
-; SI-NEXT:    v_alignbit_b32 v41, s12, v5, 24
-; SI-NEXT:    v_alignbit_b32 v43, s12, v5, 16
-; SI-NEXT:    v_alignbit_b32 v44, s12, v5, 8
-; SI-NEXT:    v_alignbit_b32 v21, s11, v13, 24
-; SI-NEXT:    v_alignbit_b32 v22, s11, v13, 16
-; SI-NEXT:    v_alignbit_b32 v24, s11, v13, 8
-; SI-NEXT:    v_alignbit_b32 v17, s10, v9, 24
-; SI-NEXT:    v_alignbit_b32 v18, s10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v20, s10, v9, 8
-; SI-NEXT:    v_alignbit_b32 v59, s9, v6, 24
-; SI-NEXT:    v_alignbit_b32 v60, s9, v6, 16
-; SI-NEXT:    v_alignbit_b32 v61, s9, v6, 8
-; SI-NEXT:    v_alignbit_b32 v46, s8, v4, 24
-; SI-NEXT:    v_alignbit_b32 v56, s8, v4, 16
-; SI-NEXT:    v_alignbit_b32 v58, s8, v4, 8
-; SI-NEXT:    v_alignbit_b32 v55, s7, v2, 24
-; SI-NEXT:    v_alignbit_b32 v40, s7, v2, 16
-; SI-NEXT:    v_alignbit_b32 v42, s7, v2, 8
-; SI-NEXT:    v_alignbit_b32 v52, s6, v1, 24
-; SI-NEXT:    v_alignbit_b32 v53, s6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v54, s6, v1, 8
-; SI-NEXT:    s_cbranch_execnz .LBB99_3
-; SI-NEXT:  .LBB99_2: ; %cmp.true
-; SI-NEXT:    s_add_i32 s18, s18, 3
-; SI-NEXT:    s_and_b32 s4, s18, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s88, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s79, s79, 3
-; SI-NEXT:    s_add_i32 s6, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s79, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s78, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s77, s77, 3
-; SI-NEXT:    s_add_i32 s7, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s77, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s76, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s75, s75, 3
-; SI-NEXT:    s_add_i32 s8, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s75, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s74, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s73, s73, 3
-; SI-NEXT:    s_add_i32 s9, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s73, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s72, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s63, s63, 3
-; SI-NEXT:    s_add_i32 s10, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s63, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s62, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s60, s60, 3
-; SI-NEXT:    s_add_i32 s11, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s60, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s59, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s61, s61, 3
-; SI-NEXT:    s_add_i32 s12, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s61, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s98, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s58, s58, 3
-; SI-NEXT:    s_add_i32 s13, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s58, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s99, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s97, s97, 3
-; SI-NEXT:    s_add_i32 s14, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s97, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s96, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s87, s87, 3
-; SI-NEXT:    s_add_i32 s15, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s87, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s86, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s85, s85, 3
-; SI-NEXT:    s_add_i32 s40, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s85, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s84, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s28, s28, 3
-; SI-NEXT:    s_add_i32 s41, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s28, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s29, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s82, s82, 3
-; SI-NEXT:    s_add_i32 s42, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s82, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s81, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s24, s24, 3
-; SI-NEXT:    s_add_i32 s43, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s24, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s25, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s26, s26, 3
-; SI-NEXT:    s_add_i32 s44, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s26, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s27, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s20, s20, 3
-; SI-NEXT:    s_add_i32 s45, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s20, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s21, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s22, s22, 3
-; SI-NEXT:    s_add_i32 s46, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s22, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s23, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s16, s16, 3
-; SI-NEXT:    s_add_i32 s47, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s16, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s17, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s56, s4, 0x30000
-; SI-NEXT:    v_readlane_b32 s4, v62, 0
-; SI-NEXT:    s_add_i32 s4, s4, 3
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s19, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s57, s4, 0x30000
-; SI-NEXT:    v_mov_b32_e32 v21, s56
-; SI-NEXT:    v_alignbit_b32 v22, s57, v21, 24
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v22, s57, v21, 16
-; SI-NEXT:    v_alignbit_b32 v21, s57, v21, 8
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v21, s46
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v22, s47, v21, 24
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v22, s47, v21, 16
-; SI-NEXT:    v_alignbit_b32 v51, s47, v21, 8
-; SI-NEXT:    v_mov_b32_e32 v21, s44
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v22, s45, v21, 24
-; SI-NEXT:    s_lshr_b32 s4, s11, 8
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 3, v3
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v22, s45, v21, 16
-; SI-NEXT:    v_alignbit_b32 v21, s45, v21, 8
-; SI-NEXT:    v_writelane_b32 v62, s4, 1
-; SI-NEXT:    s_lshr_b32 s4, s10, 16
-; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v33
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v21, s42
-; SI-NEXT:    v_writelane_b32 v62, s4, 2
-; SI-NEXT:    s_lshr_b32 s4, s10, 8
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 3, v7
-; SI-NEXT:    v_or_b32_e32 v3, v25, v3
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v22, s43, v21, 24
-; SI-NEXT:    v_writelane_b32 v62, s4, 3
-; SI-NEXT:    s_lshr_b32 s4, s9, 24
-; SI-NEXT:    v_or_b32_e32 v5, v39, v5
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; SI-NEXT:    v_add_i32_e32 v16, vcc, 0x30000, v3
-; SI-NEXT:    v_mov_b32_e32 v3, s41
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v22, s43, v21, 16
-; SI-NEXT:    v_alignbit_b32 v21, s43, v21, 8
-; SI-NEXT:    v_writelane_b32 v62, s4, 4
-; SI-NEXT:    s_lshr_b32 s4, s9, 16
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x30000, v5
-; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v31
-; SI-NEXT:    v_or_b32_e32 v7, v26, v7
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v21, v3, v16, 24
-; SI-NEXT:    v_writelane_b32 v62, s4, 5
-; SI-NEXT:    s_lshr_b32 s4, s9, 8
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 3, v11
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 0x30000, v7
-; SI-NEXT:    v_mov_b32_e32 v7, s40
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v21, v3, v16, 16
-; SI-NEXT:    v_alignbit_b32 v3, v3, v16, 8
-; SI-NEXT:    v_writelane_b32 v62, s4, 6
-; SI-NEXT:    s_lshr_b32 s4, s8, 24
-; SI-NEXT:    v_or_b32_e32 v5, v38, v5
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v3, v7, v14, 24
-; SI-NEXT:    v_writelane_b32 v62, s4, 7
-; SI-NEXT:    s_lshr_b32 s4, s8, 16
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 0x30000, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v27
-; SI-NEXT:    v_or_b32_e32 v11, v29, v11
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v3, v7, v14, 16
-; SI-NEXT:    v_writelane_b32 v62, s4, 8
-; SI-NEXT:    s_lshr_b32 s4, s8, 8
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 3, v15
-; SI-NEXT:    v_add_i32_e32 v12, vcc, 0x30000, v11
-; SI-NEXT:    v_mov_b32_e32 v11, s15
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v3, v7, v14, 8
-; SI-NEXT:    v_writelane_b32 v62, s4, 9
-; SI-NEXT:    s_lshr_b32 s4, s7, 24
-; SI-NEXT:    v_or_b32_e32 v5, v28, v5
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v3, v11, v12, 24
-; SI-NEXT:    v_writelane_b32 v62, s4, 10
-; SI-NEXT:    s_lshr_b32 s4, s7, 16
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v37
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v36
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 3, v35
-; SI-NEXT:    v_add_i32_e32 v13, vcc, 0x30000, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v23
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 3, v19
-; SI-NEXT:    v_or_b32_e32 v10, v30, v10
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v3, v11, v12, 16
-; SI-NEXT:    v_writelane_b32 v62, s4, 11
-; SI-NEXT:    s_lshr_b32 s4, s7, 8
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 0x30000, v10
-; SI-NEXT:    v_mov_b32_e32 v15, s14
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v3, v11, v12, 8
-; SI-NEXT:    v_writelane_b32 v62, s4, 12
-; SI-NEXT:    s_lshr_b32 s4, s6, 24
-; SI-NEXT:    v_or_b32_e32 v1, v50, v1
-; SI-NEXT:    v_or_b32_e32 v2, v49, v2
-; SI-NEXT:    v_or_b32_e32 v4, v48, v4
-; SI-NEXT:    v_or_b32_e32 v5, v34, v5
-; SI-NEXT:    v_or_b32_e32 v8, v32, v8
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v3, v15, v10, 24
-; SI-NEXT:    v_writelane_b32 v62, s4, 13
-; SI-NEXT:    s_lshr_b32 s4, s6, 16
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x30000, v1
-; SI-NEXT:    v_mov_b32_e32 v37, s6
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x30000, v2
-; SI-NEXT:    v_mov_b32_e32 v36, s7
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x30000, v4
-; SI-NEXT:    v_mov_b32_e32 v35, s8
-; SI-NEXT:    v_mov_b32_e32 v33, s9
-; SI-NEXT:    v_mov_b32_e32 v20, s10
-; SI-NEXT:    v_mov_b32_e32 v17, s11
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x30000, v5
-; SI-NEXT:    v_mov_b32_e32 v18, s12
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x30000, v8
-; SI-NEXT:    v_mov_b32_e32 v19, s13
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v3, v15, v10, 16
-; SI-NEXT:    v_writelane_b32 v62, s4, 14
-; SI-NEXT:    s_lshr_b32 s4, s6, 8
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v3, v15, v10, 8
-; SI-NEXT:    v_alignbit_b32 v45, v19, v8, 24
-; SI-NEXT:    v_alignbit_b32 v47, v19, v8, 16
-; SI-NEXT:    v_alignbit_b32 v57, v19, v8, 8
-; SI-NEXT:    v_alignbit_b32 v41, v18, v5, 24
-; SI-NEXT:    v_alignbit_b32 v43, v18, v5, 16
-; SI-NEXT:    v_alignbit_b32 v44, v18, v5, 8
-; SI-NEXT:    v_alignbit_b32 v21, v17, v13, 24
-; SI-NEXT:    v_alignbit_b32 v22, v17, v13, 16
-; SI-NEXT:    v_alignbit_b32 v24, v17, v13, 8
-; SI-NEXT:    v_alignbit_b32 v17, v20, v9, 24
-; SI-NEXT:    v_alignbit_b32 v18, v20, v9, 16
-; SI-NEXT:    v_alignbit_b32 v20, v20, v9, 8
-; SI-NEXT:    v_alignbit_b32 v59, v33, v6, 24
-; SI-NEXT:    v_alignbit_b32 v60, v33, v6, 16
-; SI-NEXT:    v_alignbit_b32 v61, v33, v6, 8
-; SI-NEXT:    v_alignbit_b32 v46, v35, v4, 24
-; SI-NEXT:    v_alignbit_b32 v56, v35, v4, 16
-; SI-NEXT:    v_alignbit_b32 v58, v35, v4, 8
-; SI-NEXT:    v_alignbit_b32 v55, v36, v2, 24
-; SI-NEXT:    v_alignbit_b32 v40, v36, v2, 16
-; SI-NEXT:    v_alignbit_b32 v42, v36, v2, 8
-; SI-NEXT:    v_alignbit_b32 v52, v37, v1, 24
-; SI-NEXT:    v_alignbit_b32 v53, v37, v1, 16
-; SI-NEXT:    v_alignbit_b32 v54, v37, v1, 8
-; SI-NEXT:    s_lshr_b32 s83, s57, 24
-; SI-NEXT:    s_lshr_b32 s71, s57, 16
-; SI-NEXT:    s_lshr_b32 s68, s57, 8
-; SI-NEXT:    s_lshr_b32 s80, s47, 24
-; SI-NEXT:    s_lshr_b32 s69, s47, 16
-; SI-NEXT:    s_lshr_b32 s65, s47, 8
-; SI-NEXT:    s_lshr_b32 s70, s45, 24
-; SI-NEXT:    s_lshr_b32 s66, s45, 16
-; SI-NEXT:    s_lshr_b32 s54, s45, 8
-; SI-NEXT:    s_lshr_b32 s67, s43, 24
-; SI-NEXT:    s_lshr_b32 s55, s43, 16
-; SI-NEXT:    s_lshr_b32 s51, s43, 8
-; SI-NEXT:    s_lshr_b32 s64, s41, 24
-; SI-NEXT:    s_lshr_b32 s52, s41, 16
-; SI-NEXT:    s_lshr_b32 s48, s41, 8
-; SI-NEXT:    s_lshr_b32 s53, s40, 24
-; SI-NEXT:    s_lshr_b32 s49, s40, 16
-; SI-NEXT:    s_lshr_b32 s37, s40, 8
-; SI-NEXT:    s_lshr_b32 s50, s15, 24
-; SI-NEXT:    s_lshr_b32 s38, s15, 16
-; SI-NEXT:    s_lshr_b32 s34, s15, 8
-; SI-NEXT:    s_lshr_b32 s39, s14, 24
-; SI-NEXT:    s_lshr_b32 s35, s14, 16
-; SI-NEXT:    s_lshr_b32 s95, s14, 8
-; SI-NEXT:    s_lshr_b32 s36, s13, 24
-; SI-NEXT:    s_lshr_b32 s30, s13, 16
-; SI-NEXT:    s_lshr_b32 s92, s13, 8
-; SI-NEXT:    s_lshr_b32 s31, s12, 24
-; SI-NEXT:    s_lshr_b32 s93, s12, 16
-; SI-NEXT:    s_lshr_b32 s89, s12, 8
-; SI-NEXT:    s_lshr_b32 s94, s11, 24
-; SI-NEXT:    s_lshr_b32 s90, s11, 16
-; SI-NEXT:    s_lshr_b32 s91, s10, 24
-; SI-NEXT:    v_writelane_b32 v62, s4, 15
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT:  .LBB99_3: ; %end
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; SI-NEXT:    s_and_b32 s4, s56, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s68, 8
-; SI-NEXT:    s_lshl_b32 s16, s83, 24
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    v_readlane_b32 s99, v63, 35
-; SI-NEXT:    v_readlane_b32 s98, v63, 34
-; SI-NEXT:    v_readlane_b32 s97, v63, 33
-; SI-NEXT:    v_readlane_b32 s96, v63, 32
-; SI-NEXT:    v_readlane_b32 s87, v63, 31
-; SI-NEXT:    v_readlane_b32 s86, v63, 30
-; SI-NEXT:    v_readlane_b32 s85, v63, 29
-; SI-NEXT:    v_readlane_b32 s84, v63, 28
-; SI-NEXT:    v_readlane_b32 s83, v63, 27
-; SI-NEXT:    v_readlane_b32 s82, v63, 26
-; SI-NEXT:    v_readlane_b32 s81, v63, 25
-; SI-NEXT:    v_readlane_b32 s68, v63, 20
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; SI-NEXT:    v_or_b32_e32 v3, v7, v3
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
-; SI-NEXT:    v_or_b32_e32 v7, s4, v7
-; SI-NEXT:    s_and_b32 s4, s57, 0xff
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s71, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
-; SI-NEXT:    v_or_b32_e32 v3, v7, v3
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 4, v0
-; SI-NEXT:    v_mov_b32_e32 v7, s4
-; SI-NEXT:    buffer_store_dword v7, v3, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT:    s_and_b32 s4, s46, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v51
-; SI-NEXT:    v_or_b32_e32 v3, s4, v3
-; SI-NEXT:    s_and_b32 s4, s47, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s65, 8
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s69, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s80, 24
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_lshl_b32 s5, s54, 8
-; SI-NEXT:    s_lshl_b32 s16, s70, 24
-; SI-NEXT:    v_readlane_b32 s80, v63, 24
-; SI-NEXT:    v_readlane_b32 s71, v63, 23
-; SI-NEXT:    v_readlane_b32 s70, v63, 22
-; SI-NEXT:    v_readlane_b32 s69, v63, 21
-; SI-NEXT:    v_readlane_b32 s65, v63, 17
-; SI-NEXT:    v_readlane_b32 s54, v63, 14
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; SI-NEXT:    v_or_b32_e32 v7, v11, v7
-; SI-NEXT:    v_or_b32_e32 v3, v3, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 8, v0
-; SI-NEXT:    buffer_store_dword v3, v7, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 12, v0
-; SI-NEXT:    v_mov_b32_e32 v7, s4
-; SI-NEXT:    buffer_store_dword v7, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT:    s_and_b32 s4, s44, 0xff
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; SI-NEXT:    v_or_b32_e32 v3, s4, v3
-; SI-NEXT:    s_and_b32 s4, s45, 0xff
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s66, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v7, v11, v7
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
-; SI-NEXT:    v_or_b32_e32 v3, v3, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 16, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v3, v7, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 20, v0
-; SI-NEXT:    v_mov_b32_e32 v7, s4
-; SI-NEXT:    buffer_store_dword v7, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT:    s_and_b32 s4, s42, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s51, 8
-; SI-NEXT:    s_lshl_b32 s16, s67, 24
-; SI-NEXT:    v_readlane_b32 s67, v63, 19
-; SI-NEXT:    v_readlane_b32 s66, v63, 18
-; SI-NEXT:    v_readlane_b32 s51, v63, 11
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; SI-NEXT:    v_or_b32_e32 v3, s4, v3
-; SI-NEXT:    s_and_b32 s4, s43, 0xff
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s55, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v7, v11, v7
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
-; SI-NEXT:    v_or_b32_e32 v3, v3, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 24, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v3, v7, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 28, v0
-; SI-NEXT:    v_mov_b32_e32 v7, s4
-; SI-NEXT:    buffer_store_dword v7, v3, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v16
-; SI-NEXT:    s_and_b32 s4, s41, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s48, 8
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s52, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s64, 24
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_lshl_b32 s5, s37, 8
-; SI-NEXT:    s_lshl_b32 s16, s53, 24
-; SI-NEXT:    v_readlane_b32 s64, v63, 16
-; SI-NEXT:    v_readlane_b32 s55, v63, 15
-; SI-NEXT:    v_readlane_b32 s53, v63, 13
-; SI-NEXT:    v_readlane_b32 s52, v63, 12
-; SI-NEXT:    v_readlane_b32 s48, v63, 8
-; SI-NEXT:    v_readlane_b32 s37, v63, 5
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
-; SI-NEXT:    v_or_b32_e32 v3, v3, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    v_or_b32_e32 v7, v11, v7
-; SI-NEXT:    v_or_b32_e32 v3, v3, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 32, v0
-; SI-NEXT:    buffer_store_dword v3, v7, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 36, v0
-; SI-NEXT:    v_mov_b32_e32 v7, s4
-; SI-NEXT:    buffer_store_dword v7, v3, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v14
-; SI-NEXT:    s_and_b32 s4, s40, 0xff
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s49, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_lshl_b32 s5, s34, 8
-; SI-NEXT:    v_readlane_b32 s49, v63, 9
-; SI-NEXT:    v_readlane_b32 s34, v63, 2
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
-; SI-NEXT:    v_or_b32_e32 v3, v3, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:76
+; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:72
+; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:68
+; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:64
+; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:60
+; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:56
+; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:52
+; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:48
+; SI-NEXT:    s_waitcnt expcnt(3)
+; SI-NEXT:    v_writelane_b32 v40, s30, 0
+; SI-NEXT:    v_writelane_b32 v40, s31, 1
+; SI-NEXT:    v_writelane_b32 v40, s34, 2
+; SI-NEXT:    v_writelane_b32 v40, s35, 3
+; SI-NEXT:    v_writelane_b32 v40, s36, 4
+; SI-NEXT:    v_writelane_b32 v40, s37, 5
+; SI-NEXT:    v_writelane_b32 v40, s38, 6
+; SI-NEXT:    v_writelane_b32 v40, s39, 7
+; SI-NEXT:    v_writelane_b32 v40, s48, 8
+; SI-NEXT:    v_writelane_b32 v40, s49, 9
+; SI-NEXT:    v_writelane_b32 v40, s50, 10
+; SI-NEXT:    v_writelane_b32 v40, s51, 11
+; SI-NEXT:    v_writelane_b32 v40, s52, 12
+; SI-NEXT:    v_writelane_b32 v40, s53, 13
+; SI-NEXT:    v_writelane_b32 v40, s54, 14
+; SI-NEXT:    v_writelane_b32 v40, s55, 15
+; SI-NEXT:    v_writelane_b32 v40, s64, 16
+; SI-NEXT:    v_writelane_b32 v40, s65, 17
+; SI-NEXT:    v_writelane_b32 v40, s66, 18
+; SI-NEXT:    v_writelane_b32 v40, s67, 19
+; SI-NEXT:    v_writelane_b32 v40, s68, 20
+; SI-NEXT:    v_writelane_b32 v40, s69, 21
+; SI-NEXT:    v_writelane_b32 v40, s70, 22
+; SI-NEXT:    s_mov_b32 s88, s17
+; SI-NEXT:    v_writelane_b32 v40, s71, 23
+; SI-NEXT:    v_writelane_b32 v40, s80, 24
+; SI-NEXT:    v_writelane_b32 v40, s81, 25
+; SI-NEXT:    v_writelane_b32 v40, s82, 26
+; SI-NEXT:    v_writelane_b32 v40, s83, 27
+; SI-NEXT:    v_readfirstlane_b32 s6, v16
+; SI-NEXT:    ; implicit-def: $vgpr41 : SGPR spill to VGPR lane
+; SI-NEXT:    v_readfirstlane_b32 s7, v15
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    v_writelane_b32 v41, s6, 0
+; SI-NEXT:    v_readfirstlane_b32 s8, v21
+; SI-NEXT:    v_writelane_b32 v41, s7, 1
+; SI-NEXT:    v_readfirstlane_b32 s9, v20
+; SI-NEXT:    v_writelane_b32 v41, s8, 2
+; SI-NEXT:    v_readfirstlane_b32 s10, v19
+; SI-NEXT:    v_writelane_b32 v41, s9, 3
+; SI-NEXT:    v_readfirstlane_b32 s11, v25
+; SI-NEXT:    v_writelane_b32 v41, s10, 4
+; SI-NEXT:    v_readfirstlane_b32 s12, v24
+; SI-NEXT:    v_writelane_b32 v41, s11, 5
+; SI-NEXT:    v_readfirstlane_b32 s13, v23
+; SI-NEXT:    v_writelane_b32 v41, s12, 6
+; SI-NEXT:    v_readfirstlane_b32 s15, v29
+; SI-NEXT:    v_writelane_b32 v41, s13, 7
+; SI-NEXT:    v_readfirstlane_b32 s14, v28
+; SI-NEXT:    v_writelane_b32 v41, s15, 8
+; SI-NEXT:    s_mov_b32 s79, s16
+; SI-NEXT:    v_readfirstlane_b32 s16, v27
+; SI-NEXT:    v_writelane_b32 v41, s14, 9
+; SI-NEXT:    v_writelane_b32 v41, s16, 10
+; SI-NEXT:    v_writelane_b32 v40, s84, 28
+; SI-NEXT:    v_writelane_b32 v40, s85, 29
+; SI-NEXT:    v_writelane_b32 v40, s86, 30
+; SI-NEXT:    v_writelane_b32 v40, s87, 31
+; SI-NEXT:    v_writelane_b32 v40, s96, 32
+; SI-NEXT:    v_writelane_b32 v40, s97, 33
+; SI-NEXT:    v_writelane_b32 v40, s98, 34
+; SI-NEXT:    v_writelane_b32 v40, s99, 35
+; SI-NEXT:    v_readfirstlane_b32 s98, v30
+; SI-NEXT:    v_readfirstlane_b32 s97, v26
+; SI-NEXT:    v_readfirstlane_b32 s96, v22
+; SI-NEXT:    v_readfirstlane_b32 s87, v18
+; SI-NEXT:    v_readfirstlane_b32 s81, v17
+; SI-NEXT:    v_readfirstlane_b32 s86, v14
+; SI-NEXT:    v_readfirstlane_b32 s67, v13
+; SI-NEXT:    v_readfirstlane_b32 s69, v12
+; SI-NEXT:    v_readfirstlane_b32 s71, v11
+; SI-NEXT:    v_readfirstlane_b32 s85, v10
+; SI-NEXT:    v_readfirstlane_b32 s51, v9
+; SI-NEXT:    v_readfirstlane_b32 s53, v8
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_readfirstlane_b32 s89, v31
+; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    v_readfirstlane_b32 s91, v32
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_readfirstlane_b32 s93, v33
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:44
+; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:40
+; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:36
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:32
+; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:28
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:20
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:16
+; SI-NEXT:    s_waitcnt vmcnt(12)
+; SI-NEXT:    v_readfirstlane_b32 s55, v34
+; SI-NEXT:    s_waitcnt vmcnt(11)
+; SI-NEXT:    v_readfirstlane_b32 s17, v35
+; SI-NEXT:    s_waitcnt vmcnt(10)
+; SI-NEXT:    v_readfirstlane_b32 s95, v36
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_readfirstlane_b32 s35, v37
+; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:4
+; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32
+; SI-NEXT:    s_waitcnt vmcnt(12)
+; SI-NEXT:    v_readfirstlane_b32 s83, v38
+; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:80
+; SI-NEXT:    v_readfirstlane_b32 s65, v7
+; SI-NEXT:    v_readfirstlane_b32 s84, v6
+; SI-NEXT:    v_readfirstlane_b32 s31, v5
+; SI-NEXT:    v_readfirstlane_b32 s37, v4
+; SI-NEXT:    v_readfirstlane_b32 s49, v3
+; SI-NEXT:    v_readfirstlane_b32 s78, v2
+; SI-NEXT:    v_readfirstlane_b32 s39, v1
+; SI-NEXT:    ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
+; SI-NEXT:    ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
+; SI-NEXT:    s_waitcnt vmcnt(12)
+; SI-NEXT:    v_readfirstlane_b32 s77, v31
+; SI-NEXT:    s_waitcnt vmcnt(11)
+; SI-NEXT:    v_readfirstlane_b32 s38, v32
+; SI-NEXT:    s_waitcnt vmcnt(10)
+; SI-NEXT:    v_readfirstlane_b32 s48, v33
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_readfirstlane_b32 s50, v39
+; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_readfirstlane_b32 s76, v48
+; SI-NEXT:    s_waitcnt vmcnt(7)
+; SI-NEXT:    v_readfirstlane_b32 s30, v49
+; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    v_readfirstlane_b32 s34, v50
+; SI-NEXT:    s_waitcnt vmcnt(5)
+; SI-NEXT:    v_readfirstlane_b32 s36, v51
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_readfirstlane_b32 s99, v34
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_readfirstlane_b32 s90, v35
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_readfirstlane_b32 s92, v36
+; SI-NEXT:    v_writelane_b32 v41, s90, 11
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
+; SI-NEXT:    v_readfirstlane_b32 s94, v37
+; SI-NEXT:    v_writelane_b32 v41, s92, 12
+; SI-NEXT:    v_writelane_b32 v41, s94, 13
+; SI-NEXT:    v_writelane_b32 v41, s30, 14
+; SI-NEXT:    v_writelane_b32 v41, s34, 15
+; SI-NEXT:    v_writelane_b32 v41, s36, 16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    v_or_b32_e32 v7, v11, v7
-; SI-NEXT:    v_or_b32_e32 v3, v3, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 40, v0
-; SI-NEXT:    buffer_store_dword v3, v7, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 44, v0
-; SI-NEXT:    v_mov_b32_e32 v7, s4
-; SI-NEXT:    buffer_store_dword v7, v3, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v12
-; SI-NEXT:    s_and_b32 s4, s15, 0xff
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v38
+; SI-NEXT:    v_writelane_b32 v41, s38, 17
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    v_writelane_b32 v41, s48, 18
+; SI-NEXT:    v_writelane_b32 v41, s50, 19
+; SI-NEXT:    s_cbranch_scc0 .LBB99_4
+; SI-NEXT:  ; %bb.1: ; %cmp.false
+; SI-NEXT:    s_and_b32 s4, s79, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s88, 16
+; SI-NEXT:    s_or_b32 s60, s4, s5
+; SI-NEXT:    s_and_b32 s4, s18, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s19, 16
+; SI-NEXT:    s_or_b32 s61, s4, s5
+; SI-NEXT:    s_and_b32 s4, s20, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s21, 16
+; SI-NEXT:    s_or_b32 s56, s4, s5
+; SI-NEXT:    s_and_b32 s4, s22, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s23, 16
+; SI-NEXT:    s_or_b32 s57, s4, s5
+; SI-NEXT:    s_and_b32 s4, s24, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s25, 16
+; SI-NEXT:    s_or_b32 s44, s4, s5
+; SI-NEXT:    s_and_b32 s4, s26, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s27, 16
+; SI-NEXT:    s_or_b32 s45, s4, s5
+; SI-NEXT:    s_and_b32 s4, s28, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s29, 16
+; SI-NEXT:    s_or_b32 s74, s4, s5
+; SI-NEXT:    s_and_b32 s4, s39, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s78, 16
+; SI-NEXT:    s_or_b32 s75, s4, s5
+; SI-NEXT:    s_and_b32 s4, s49, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s37, 16
+; SI-NEXT:    s_or_b32 s72, s4, s5
+; SI-NEXT:    s_and_b32 s4, s31, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s84, 16
+; SI-NEXT:    s_or_b32 s73, s4, s5
+; SI-NEXT:    s_and_b32 s4, s65, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s53, 16
+; SI-NEXT:    s_or_b32 s62, s4, s5
+; SI-NEXT:    s_and_b32 s4, s51, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s85, 16
+; SI-NEXT:    s_or_b32 s63, s4, s5
+; SI-NEXT:    s_and_b32 s4, s71, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s69, 16
+; SI-NEXT:    s_or_b32 s58, s4, s5
+; SI-NEXT:    s_and_b32 s4, s67, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s86, 16
+; SI-NEXT:    s_or_b32 s59, s4, s5
+; SI-NEXT:    s_and_b32 s4, s7, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s6, 16
+; SI-NEXT:    s_or_b32 s46, s4, s5
+; SI-NEXT:    s_and_b32 s4, s81, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s87, 16
+; SI-NEXT:    s_or_b32 s47, s4, s5
+; SI-NEXT:    s_and_b32 s4, s10, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s9, 16
+; SI-NEXT:    s_or_b32 s42, s4, s5
+; SI-NEXT:    s_and_b32 s4, s8, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s96, 16
+; SI-NEXT:    s_or_b32 s43, s4, s5
+; SI-NEXT:    s_and_b32 s4, s13, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s12, 16
+; SI-NEXT:    s_or_b32 s40, s4, s5
+; SI-NEXT:    s_and_b32 s4, s11, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s97, 16
+; SI-NEXT:    s_or_b32 s41, s4, s5
+; SI-NEXT:    s_and_b32 s4, s16, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s14, 16
+; SI-NEXT:    s_or_b32 s14, s4, s5
+; SI-NEXT:    s_and_b32 s4, s15, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s98, 16
+; SI-NEXT:    s_or_b32 s15, s4, s5
+; SI-NEXT:    s_and_b32 s4, s94, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s92, 16
+; SI-NEXT:    s_or_b32 s12, s4, s5
+; SI-NEXT:    s_and_b32 s4, s90, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s99, 16
+; SI-NEXT:    s_or_b32 s13, s4, s5
+; SI-NEXT:    s_and_b32 s4, s36, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s34, 16
+; SI-NEXT:    s_or_b32 s10, s4, s5
+; SI-NEXT:    s_and_b32 s4, s30, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s76, 16
+; SI-NEXT:    s_or_b32 s11, s4, s5
+; SI-NEXT:    s_and_b32 s4, s50, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s48, 16
+; SI-NEXT:    s_or_b32 s8, s4, s5
+; SI-NEXT:    s_and_b32 s4, s38, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s77, 16
+; SI-NEXT:    s_or_b32 s9, s4, s5
+; SI-NEXT:    s_and_b32 s4, s83, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s35, 16
+; SI-NEXT:    s_or_b32 s6, s4, s5
+; SI-NEXT:    s_and_b32 s4, s95, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s17, 16
+; SI-NEXT:    s_or_b32 s7, s4, s5
+; SI-NEXT:    s_and_b32 s4, s55, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s93, 16
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s38, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s15, s50, 24
+; SI-NEXT:    s_and_b32 s5, s91, 0xffff
+; SI-NEXT:    s_lshl_b32 s16, s89, 16
+; SI-NEXT:    s_or_b32 s5, s5, s16
+; SI-NEXT:    s_lshr_b32 s16, s61, 8
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_writelane_b32 v43, s16, 20
+; SI-NEXT:    s_lshr_b32 s16, s57, 8
+; SI-NEXT:    v_writelane_b32 v43, s16, 23
+; SI-NEXT:    s_lshr_b32 s16, s45, 8
+; SI-NEXT:    v_writelane_b32 v43, s16, 26
+; SI-NEXT:    s_lshr_b32 s16, s75, 8
+; SI-NEXT:    v_writelane_b32 v43, s16, 29
+; SI-NEXT:    s_lshr_b32 s16, s73, 8
+; SI-NEXT:    s_lshr_b64 vcc, s[60:61], 24
+; SI-NEXT:    v_writelane_b32 v43, s16, 32
+; SI-NEXT:    s_lshr_b32 s16, s63, 8
+; SI-NEXT:    v_writelane_b32 v41, vcc_lo, 22
+; SI-NEXT:    v_writelane_b32 v43, s16, 35
+; SI-NEXT:    s_lshr_b32 s16, s59, 8
+; SI-NEXT:    v_writelane_b32 v41, vcc_hi, 23
+; SI-NEXT:    s_lshr_b64 vcc, s[60:61], 16
+; SI-NEXT:    v_writelane_b32 v43, s16, 38
+; SI-NEXT:    s_lshr_b32 s16, s47, 8
+; SI-NEXT:    v_writelane_b32 v41, vcc_lo, 20
+; SI-NEXT:    v_writelane_b32 v43, s16, 41
+; SI-NEXT:    s_lshr_b32 s16, s43, 8
+; SI-NEXT:    v_writelane_b32 v41, vcc_hi, 21
+; SI-NEXT:    s_lshr_b64 vcc, s[56:57], 24
+; SI-NEXT:    v_writelane_b32 v43, s16, 44
+; SI-NEXT:    s_lshr_b32 s16, s41, 8
+; SI-NEXT:    v_writelane_b32 v41, vcc_lo, 28
+; SI-NEXT:    v_writelane_b32 v43, s16, 47
+; SI-NEXT:    s_lshr_b32 s16, s15, 8
+; SI-NEXT:    v_writelane_b32 v41, vcc_hi, 29
+; SI-NEXT:    s_lshr_b64 vcc, s[56:57], 16
+; SI-NEXT:    v_writelane_b32 v43, s16, 50
+; SI-NEXT:    s_lshr_b32 s16, s13, 8
+; SI-NEXT:    v_writelane_b32 v41, vcc_lo, 26
+; SI-NEXT:    v_writelane_b32 v43, s16, 53
+; SI-NEXT:    s_lshr_b32 s16, s11, 8
+; SI-NEXT:    v_writelane_b32 v41, vcc_hi, 27
+; SI-NEXT:    s_lshr_b64 vcc, s[56:57], 8
+; SI-NEXT:    v_writelane_b32 v43, s16, 56
+; SI-NEXT:    s_lshr_b32 s16, s9, 8
+; SI-NEXT:    v_writelane_b32 v41, vcc_lo, 24
+; SI-NEXT:    v_writelane_b32 v43, s16, 59
+; SI-NEXT:    s_lshr_b32 s16, s7, 8
+; SI-NEXT:    v_writelane_b32 v41, vcc_hi, 25
+; SI-NEXT:    s_lshr_b64 vcc, s[46:47], 24
+; SI-NEXT:    v_writelane_b32 v43, s16, 62
+; SI-NEXT:    s_lshr_b32 s16, s5, 8
+; SI-NEXT:    v_writelane_b32 v41, vcc_lo, 32
+; SI-NEXT:    v_writelane_b32 v42, s16, 1
+; SI-NEXT:    s_and_b32 s16, s19, 0xffff
+; SI-NEXT:    v_writelane_b32 v41, vcc_hi, 33
+; SI-NEXT:    s_lshr_b64 vcc, s[46:47], 16
+; SI-NEXT:    v_writelane_b32 v43, s16, 19
+; SI-NEXT:    s_and_b32 s16, s23, 0xffff
+; SI-NEXT:    v_writelane_b32 v41, vcc_lo, 30
+; SI-NEXT:    v_writelane_b32 v43, s16, 22
+; SI-NEXT:    s_and_b32 s16, s27, 0xffff
+; SI-NEXT:    v_writelane_b32 v41, vcc_hi, 31
+; SI-NEXT:    s_lshr_b64 vcc, s[42:43], 24
+; SI-NEXT:    v_writelane_b32 v43, s16, 25
+; SI-NEXT:    s_and_b32 s16, s78, 0xffff
+; SI-NEXT:    v_writelane_b32 v41, vcc_lo, 38
+; SI-NEXT:    v_writelane_b32 v43, s16, 28
+; SI-NEXT:    s_and_b32 s16, s84, 0xffff
+; SI-NEXT:    v_writelane_b32 v41, vcc_hi, 39
+; SI-NEXT:    s_lshr_b64 vcc, s[42:43], 16
+; SI-NEXT:    v_writelane_b32 v43, s16, 31
+; SI-NEXT:    s_and_b32 s16, s85, 0xffff
+; SI-NEXT:    v_writelane_b32 v41, vcc_lo, 36
+; SI-NEXT:    v_writelane_b32 v43, s16, 34
+; SI-NEXT:    s_and_b32 s16, s86, 0xffff
+; SI-NEXT:    v_writelane_b32 v41, vcc_hi, 37
+; SI-NEXT:    s_lshr_b64 vcc, s[42:43], 8
+; SI-NEXT:    v_writelane_b32 v43, s16, 37
+; SI-NEXT:    s_and_b32 s16, s87, 0xffff
+; SI-NEXT:    v_writelane_b32 v41, vcc_lo, 34
+; SI-NEXT:    v_writelane_b32 v43, s16, 40
+; SI-NEXT:    s_and_b32 s16, s96, 0xffff
+; SI-NEXT:    v_writelane_b32 v41, vcc_hi, 35
+; SI-NEXT:    s_lshr_b64 vcc, s[40:41], 24
+; SI-NEXT:    v_writelane_b32 v43, s16, 43
+; SI-NEXT:    s_and_b32 s16, s97, 0xffff
+; SI-NEXT:    v_writelane_b32 v41, vcc_lo, 44
+; SI-NEXT:    v_writelane_b32 v43, s16, 46
+; SI-NEXT:    s_and_b32 s16, s98, 0xffff
+; SI-NEXT:    v_writelane_b32 v41, vcc_hi, 45
+; SI-NEXT:    s_lshr_b64 vcc, s[40:41], 16
+; SI-NEXT:    v_writelane_b32 v43, s16, 49
+; SI-NEXT:    s_and_b32 s16, s99, 0xffff
+; SI-NEXT:    v_writelane_b32 v41, vcc_lo, 42
+; SI-NEXT:    v_writelane_b32 v43, s16, 52
+; SI-NEXT:    s_and_b32 s16, s76, 0xffff
+; SI-NEXT:    v_writelane_b32 v41, vcc_hi, 43
+; SI-NEXT:    s_lshr_b64 vcc, s[40:41], 8
+; SI-NEXT:    v_writelane_b32 v43, s16, 55
+; SI-NEXT:    s_and_b32 s16, s77, 0xffff
+; SI-NEXT:    v_writelane_b32 v41, vcc_lo, 40
+; SI-NEXT:    v_writelane_b32 v43, s16, 58
+; SI-NEXT:    s_and_b32 s16, s17, 0xffff
+; SI-NEXT:    v_writelane_b32 v41, vcc_hi, 41
+; SI-NEXT:    s_lshr_b64 vcc, s[14:15], 24
+; SI-NEXT:    v_writelane_b32 v43, s16, 61
+; SI-NEXT:    s_and_b32 s16, s89, 0xffff
+; SI-NEXT:    v_writelane_b32 v41, vcc_lo, 50
+; SI-NEXT:    v_writelane_b32 v42, s16, 0
+; SI-NEXT:    s_bfe_u32 s16, s19, 0x80008
+; SI-NEXT:    v_writelane_b32 v41, vcc_hi, 51
+; SI-NEXT:    s_lshr_b64 vcc, s[14:15], 16
+; SI-NEXT:    v_writelane_b32 v43, s16, 18
+; SI-NEXT:    s_bfe_u32 s16, s23, 0x80008
+; SI-NEXT:    v_writelane_b32 v41, vcc_lo, 48
+; SI-NEXT:    v_writelane_b32 v43, s16, 21
+; SI-NEXT:    s_bfe_u32 s16, s27, 0x80008
+; SI-NEXT:    v_writelane_b32 v41, vcc_hi, 49
+; SI-NEXT:    s_lshr_b64 vcc, s[14:15], 8
+; SI-NEXT:    v_writelane_b32 v43, s16, 24
+; SI-NEXT:    s_bfe_u32 s16, s78, 0x80008
+; SI-NEXT:    v_writelane_b32 v41, vcc_lo, 46
+; SI-NEXT:    v_writelane_b32 v43, s16, 27
+; SI-NEXT:    s_bfe_u32 s16, s84, 0x80008
+; SI-NEXT:    v_writelane_b32 v41, vcc_hi, 47
+; SI-NEXT:    s_lshr_b64 vcc, s[12:13], 24
+; SI-NEXT:    v_writelane_b32 v43, s16, 30
+; SI-NEXT:    s_bfe_u32 s16, s85, 0x80008
+; SI-NEXT:    v_writelane_b32 v41, vcc_lo, 56
+; SI-NEXT:    v_writelane_b32 v43, s16, 33
+; SI-NEXT:    s_bfe_u32 s16, s86, 0x80008
+; SI-NEXT:    v_writelane_b32 v41, vcc_hi, 57
+; SI-NEXT:    s_lshr_b64 vcc, s[12:13], 16
+; SI-NEXT:    v_writelane_b32 v43, s16, 36
+; SI-NEXT:    s_bfe_u32 s16, s87, 0x80008
+; SI-NEXT:    v_writelane_b32 v41, vcc_lo, 54
+; SI-NEXT:    v_writelane_b32 v43, s16, 39
+; SI-NEXT:    s_bfe_u32 s16, s96, 0x80008
+; SI-NEXT:    v_writelane_b32 v41, vcc_hi, 55
+; SI-NEXT:    s_lshr_b64 vcc, s[12:13], 8
+; SI-NEXT:    v_writelane_b32 v43, s16, 42
+; SI-NEXT:    s_bfe_u32 s16, s97, 0x80008
+; SI-NEXT:    v_writelane_b32 v41, vcc_lo, 52
+; SI-NEXT:    v_writelane_b32 v43, s16, 45
+; SI-NEXT:    s_bfe_u32 s16, s98, 0x80008
+; SI-NEXT:    v_writelane_b32 v41, vcc_hi, 53
+; SI-NEXT:    s_lshr_b64 vcc, s[10:11], 24
+; SI-NEXT:    v_writelane_b32 v43, s16, 48
+; SI-NEXT:    s_bfe_u32 s16, s99, 0x80008
+; SI-NEXT:    v_writelane_b32 v41, vcc_lo, 62
+; SI-NEXT:    v_writelane_b32 v43, s16, 51
+; SI-NEXT:    s_bfe_u32 s16, s76, 0x80008
+; SI-NEXT:    v_writelane_b32 v41, vcc_hi, 63
+; SI-NEXT:    s_lshr_b64 vcc, s[10:11], 16
+; SI-NEXT:    v_writelane_b32 v43, s16, 54
+; SI-NEXT:    s_bfe_u32 s16, s77, 0x80008
+; SI-NEXT:    v_writelane_b32 v41, vcc_lo, 60
+; SI-NEXT:    v_writelane_b32 v43, s16, 57
+; SI-NEXT:    s_bfe_u32 s16, s17, 0x80008
+; SI-NEXT:    v_writelane_b32 v41, vcc_hi, 61
+; SI-NEXT:    s_lshr_b64 vcc, s[10:11], 8
+; SI-NEXT:    v_writelane_b32 v43, s16, 60
+; SI-NEXT:    s_bfe_u32 s16, s89, 0x80008
+; SI-NEXT:    v_writelane_b32 v41, vcc_lo, 58
+; SI-NEXT:    v_writelane_b32 v43, s16, 63
+; SI-NEXT:    v_writelane_b32 v41, vcc_hi, 59
+; SI-NEXT:    s_lshr_b64 vcc, s[8:9], 24
+; SI-NEXT:    s_mov_b32 s16, s93
+; SI-NEXT:    s_lshr_b64 s[92:93], s[60:61], 8
+; SI-NEXT:    v_writelane_b32 v43, vcc_lo, 4
+; SI-NEXT:    s_mov_b32 s93, s16
+; SI-NEXT:    s_mov_b32 s16, s71
+; SI-NEXT:    s_lshr_b64 s[70:71], s[44:45], 24
+; SI-NEXT:    v_writelane_b32 v43, vcc_hi, 5
+; SI-NEXT:    s_lshr_b64 vcc, s[8:9], 16
+; SI-NEXT:    s_mov_b32 s71, s16
+; SI-NEXT:    s_mov_b32 s16, s81
+; SI-NEXT:    s_lshr_b64 s[80:81], s[44:45], 16
+; SI-NEXT:    v_writelane_b32 v43, vcc_lo, 2
+; SI-NEXT:    s_mov_b32 s81, s16
+; SI-NEXT:    s_mov_b32 s16, s83
+; SI-NEXT:    s_lshr_b64 s[82:83], s[44:45], 8
+; SI-NEXT:    v_writelane_b32 v43, vcc_hi, 3
+; SI-NEXT:    s_lshr_b64 vcc, s[8:9], 8
+; SI-NEXT:    s_mov_b32 s83, s16
+; SI-NEXT:    s_mov_b32 s16, s65
+; SI-NEXT:    s_lshr_b64 s[64:65], s[74:75], 24
+; SI-NEXT:    v_writelane_b32 v43, vcc_lo, 0
+; SI-NEXT:    s_mov_b32 s65, s16
+; SI-NEXT:    s_mov_b32 s16, s67
+; SI-NEXT:    s_lshr_b64 s[66:67], s[74:75], 16
+; SI-NEXT:    v_writelane_b32 v43, vcc_hi, 1
+; SI-NEXT:    s_lshr_b64 vcc, s[6:7], 24
+; SI-NEXT:    s_mov_b32 s67, s16
+; SI-NEXT:    s_mov_b32 s16, s69
+; SI-NEXT:    s_lshr_b64 s[68:69], s[74:75], 8
+; SI-NEXT:    v_writelane_b32 v43, vcc_lo, 10
+; SI-NEXT:    s_mov_b32 s69, s16
+; SI-NEXT:    s_mov_b32 s16, s51
+; SI-NEXT:    s_lshr_b64 s[50:51], s[72:73], 24
+; SI-NEXT:    v_writelane_b32 v43, vcc_hi, 11
+; SI-NEXT:    s_lshr_b64 vcc, s[6:7], 16
+; SI-NEXT:    s_mov_b32 s51, s16
+; SI-NEXT:    s_mov_b32 s16, s53
+; SI-NEXT:    s_lshr_b64 s[52:53], s[72:73], 16
+; SI-NEXT:    v_writelane_b32 v43, vcc_lo, 8
+; SI-NEXT:    s_mov_b32 s53, s16
+; SI-NEXT:    s_mov_b32 s16, s55
+; SI-NEXT:    s_lshr_b64 s[54:55], s[72:73], 8
+; SI-NEXT:    v_writelane_b32 v43, vcc_hi, 9
+; SI-NEXT:    s_lshr_b64 vcc, s[6:7], 8
+; SI-NEXT:    s_mov_b32 s55, s16
+; SI-NEXT:    s_mov_b32 s16, s37
+; SI-NEXT:    s_lshr_b64 s[36:37], s[62:63], 24
+; SI-NEXT:    v_writelane_b32 v43, vcc_lo, 6
+; SI-NEXT:    s_mov_b32 s37, s16
+; SI-NEXT:    s_mov_b32 s16, s39
+; SI-NEXT:    s_lshr_b64 s[38:39], s[62:63], 16
+; SI-NEXT:    v_writelane_b32 v43, vcc_hi, 7
+; SI-NEXT:    s_lshr_b64 vcc, s[4:5], 24
+; SI-NEXT:    s_mov_b32 s39, s16
+; SI-NEXT:    s_mov_b32 s16, s49
+; SI-NEXT:    s_lshr_b64 s[48:49], s[62:63], 8
+; SI-NEXT:    v_writelane_b32 v43, vcc_lo, 16
+; SI-NEXT:    s_mov_b32 s49, s16
+; SI-NEXT:    s_mov_b32 s16, s95
+; SI-NEXT:    s_lshr_b64 s[94:95], s[58:59], 24
+; SI-NEXT:    v_writelane_b32 v43, vcc_hi, 17
+; SI-NEXT:    s_lshr_b64 vcc, s[4:5], 16
+; SI-NEXT:    s_mov_b32 s95, s16
+; SI-NEXT:    s_mov_b32 s16, s31
+; SI-NEXT:    s_lshr_b64 s[30:31], s[58:59], 16
+; SI-NEXT:    v_writelane_b32 v43, vcc_lo, 14
+; SI-NEXT:    s_mov_b32 s31, s16
+; SI-NEXT:    s_mov_b32 s16, s35
+; SI-NEXT:    s_lshr_b64 s[34:35], s[58:59], 8
+; SI-NEXT:    v_writelane_b32 v43, vcc_hi, 15
+; SI-NEXT:    s_lshr_b64 vcc, s[4:5], 8
+; SI-NEXT:    s_mov_b32 s35, s16
+; SI-NEXT:    s_mov_b32 s16, s91
+; SI-NEXT:    s_lshr_b64 s[90:91], s[46:47], 8
+; SI-NEXT:    v_writelane_b32 v43, vcc_lo, 12
+; SI-NEXT:    s_mov_b32 s91, s16
+; SI-NEXT:    v_writelane_b32 v43, vcc_hi, 13
+; SI-NEXT:    s_cbranch_execnz .LBB99_3
+; SI-NEXT:  .LBB99_2: ; %cmp.true
+; SI-NEXT:    s_add_i32 s4, s55, 3
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s15, s5
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_lshl_b32 s5, s95, 8
-; SI-NEXT:    v_readlane_b32 s50, v63, 10
-; SI-NEXT:    v_readlane_b32 s38, v63, 6
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
-; SI-NEXT:    v_or_b32_e32 v3, v3, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    v_or_b32_e32 v7, v11, v7
-; SI-NEXT:    v_or_b32_e32 v3, v3, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 48, v0
-; SI-NEXT:    buffer_store_dword v3, v7, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 52, v0
-; SI-NEXT:    v_mov_b32_e32 v7, s4
-; SI-NEXT:    buffer_store_dword v7, v3, s[0:3], 0 offen
+; SI-NEXT:    s_lshl_b32 s5, s93, 16
+; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    s_add_i32 s5, s91, 3
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_lshl_b32 s6, s89, 16
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    s_add_i32 s6, s83, 3
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_lshl_b32 s7, s35, 16
+; SI-NEXT:    s_or_b32 s6, s7, s6
+; SI-NEXT:    s_add_i32 s7, s95, 3
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_lshl_b32 s8, s17, 16
+; SI-NEXT:    s_or_b32 s7, s8, s7
+; SI-NEXT:    v_readlane_b32 s8, v41, 19
+; SI-NEXT:    s_add_i32 s8, s8, 3
+; SI-NEXT:    v_readlane_b32 s9, v41, 18
+; SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_or_b32 s8, s9, s8
+; SI-NEXT:    v_readlane_b32 s9, v41, 17
+; SI-NEXT:    s_add_i32 s9, s9, 3
+; SI-NEXT:    s_and_b32 s9, s9, 0xffff
+; SI-NEXT:    s_lshl_b32 s10, s77, 16
+; SI-NEXT:    s_or_b32 s9, s10, s9
+; SI-NEXT:    v_readlane_b32 s10, v41, 16
+; SI-NEXT:    s_add_i32 s10, s10, 3
+; SI-NEXT:    v_readlane_b32 s11, v41, 15
+; SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s11, 16
+; SI-NEXT:    s_or_b32 s10, s11, s10
+; SI-NEXT:    v_readlane_b32 s11, v41, 14
+; SI-NEXT:    s_add_i32 s11, s11, 3
+; SI-NEXT:    s_and_b32 s11, s11, 0xffff
+; SI-NEXT:    s_lshl_b32 s12, s76, 16
+; SI-NEXT:    s_or_b32 s11, s12, s11
+; SI-NEXT:    v_readlane_b32 s12, v41, 13
+; SI-NEXT:    s_add_i32 s12, s12, 3
+; SI-NEXT:    v_readlane_b32 s13, v41, 12
+; SI-NEXT:    s_and_b32 s12, s12, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s13, 16
+; SI-NEXT:    s_or_b32 s12, s13, s12
+; SI-NEXT:    v_readlane_b32 s13, v41, 11
+; SI-NEXT:    s_add_i32 s13, s13, 3
+; SI-NEXT:    s_and_b32 s13, s13, 0xffff
+; SI-NEXT:    s_lshl_b32 s14, s99, 16
+; SI-NEXT:    s_or_b32 s13, s14, s13
+; SI-NEXT:    v_readlane_b32 s14, v41, 10
+; SI-NEXT:    s_add_i32 s14, s14, 3
+; SI-NEXT:    v_readlane_b32 s15, v41, 9
+; SI-NEXT:    s_and_b32 s14, s14, 0xffff
+; SI-NEXT:    s_lshl_b32 s15, s15, 16
+; SI-NEXT:    s_or_b32 s14, s15, s14
+; SI-NEXT:    v_readlane_b32 s15, v41, 8
+; SI-NEXT:    s_add_i32 s15, s15, 3
+; SI-NEXT:    s_and_b32 s15, s15, 0xffff
+; SI-NEXT:    s_lshl_b32 s16, s98, 16
+; SI-NEXT:    s_or_b32 s15, s16, s15
+; SI-NEXT:    v_readlane_b32 s16, v41, 7
+; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    v_readlane_b32 s17, v41, 6
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s40, s16, 0x30000
+; SI-NEXT:    v_readlane_b32 s16, v41, 5
+; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s97, 16
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s41, s16, 0x30000
+; SI-NEXT:    v_readlane_b32 s16, v41, 4
+; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    v_readlane_b32 s17, v41, 3
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s42, s16, 0x30000
+; SI-NEXT:    v_readlane_b32 s16, v41, 2
+; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s96, 16
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s43, s16, 0x30000
+; SI-NEXT:    v_readlane_b32 s16, v41, 1
+; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    v_readlane_b32 s17, v41, 0
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s46, s16, 0x30000
+; SI-NEXT:    s_add_i32 s16, s81, 3
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s87, 16
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s47, s16, 0x30000
+; SI-NEXT:    s_add_i32 s16, s71, 3
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s69, 16
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s58, s16, 0x30000
+; SI-NEXT:    s_add_i32 s16, s67, 3
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s86, 16
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s59, s16, 0x30000
+; SI-NEXT:    s_add_i32 s16, s65, 3
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s53, 16
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s62, s16, 0x30000
+; SI-NEXT:    s_add_i32 s16, s51, 3
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s85, 16
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s63, s16, 0x30000
+; SI-NEXT:    s_add_i32 s16, s49, 3
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s37, 16
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s72, s16, 0x30000
+; SI-NEXT:    s_add_i32 s16, s31, 3
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s84, 16
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s73, s16, 0x30000
+; SI-NEXT:    s_add_i32 s16, s28, 3
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s29, 16
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s74, s16, 0x30000
+; SI-NEXT:    s_add_i32 s16, s39, 3
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s78, 16
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s75, s16, 0x30000
+; SI-NEXT:    s_add_i32 s16, s24, 3
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s25, 16
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s44, s16, 0x30000
+; SI-NEXT:    s_add_i32 s16, s26, 3
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s27, 16
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s45, s16, 0x30000
+; SI-NEXT:    s_add_i32 s16, s20, 3
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s21, 16
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s56, s16, 0x30000
+; SI-NEXT:    s_add_i32 s16, s22, 3
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s23, 16
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s57, s16, 0x30000
+; SI-NEXT:    s_add_i32 s16, s79, 3
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s88, 16
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s60, s16, 0x30000
+; SI-NEXT:    s_add_i32 s16, s18, 3
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s19, 16
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s61, s16, 0x30000
+; SI-NEXT:    s_lshr_b64 s[16:17], s[60:61], 24
+; SI-NEXT:    v_writelane_b32 v41, s16, 22
+; SI-NEXT:    v_writelane_b32 v41, s17, 23
+; SI-NEXT:    s_lshr_b64 s[16:17], s[60:61], 16
+; SI-NEXT:    v_writelane_b32 v41, s16, 20
+; SI-NEXT:    v_writelane_b32 v41, s17, 21
+; SI-NEXT:    s_lshr_b32 s16, s61, 24
+; SI-NEXT:    v_writelane_b32 v43, s16, 18
+; SI-NEXT:    s_lshr_b32 s16, s61, 16
+; SI-NEXT:    v_writelane_b32 v43, s16, 19
+; SI-NEXT:    s_lshr_b32 s16, s61, 8
+; SI-NEXT:    v_writelane_b32 v43, s16, 20
+; SI-NEXT:    s_lshr_b32 s16, s57, 24
+; SI-NEXT:    v_writelane_b32 v43, s16, 21
+; SI-NEXT:    s_lshr_b32 s16, s57, 16
+; SI-NEXT:    v_writelane_b32 v43, s16, 22
+; SI-NEXT:    s_lshr_b32 s16, s57, 8
+; SI-NEXT:    v_writelane_b32 v43, s16, 23
+; SI-NEXT:    s_lshr_b32 s16, s45, 24
+; SI-NEXT:    v_writelane_b32 v43, s16, 24
+; SI-NEXT:    s_lshr_b32 s16, s45, 16
+; SI-NEXT:    v_writelane_b32 v43, s16, 25
+; SI-NEXT:    s_lshr_b32 s16, s45, 8
+; SI-NEXT:    v_writelane_b32 v43, s16, 26
+; SI-NEXT:    s_lshr_b32 s16, s75, 24
+; SI-NEXT:    v_writelane_b32 v43, s16, 27
+; SI-NEXT:    s_lshr_b32 s16, s75, 16
+; SI-NEXT:    v_writelane_b32 v43, s16, 28
+; SI-NEXT:    s_lshr_b32 s16, s75, 8
+; SI-NEXT:    v_writelane_b32 v43, s16, 29
+; SI-NEXT:    s_lshr_b32 s16, s73, 24
+; SI-NEXT:    v_writelane_b32 v43, s16, 30
+; SI-NEXT:    s_lshr_b32 s16, s73, 16
+; SI-NEXT:    v_writelane_b32 v43, s16, 31
+; SI-NEXT:    s_lshr_b32 s16, s73, 8
+; SI-NEXT:    v_writelane_b32 v43, s16, 32
+; SI-NEXT:    s_lshr_b32 s16, s63, 24
+; SI-NEXT:    v_writelane_b32 v43, s16, 33
+; SI-NEXT:    s_lshr_b32 s16, s63, 16
+; SI-NEXT:    v_writelane_b32 v43, s16, 34
+; SI-NEXT:    s_lshr_b32 s16, s63, 8
+; SI-NEXT:    v_writelane_b32 v43, s16, 35
+; SI-NEXT:    s_lshr_b32 s16, s59, 24
+; SI-NEXT:    v_writelane_b32 v43, s16, 36
+; SI-NEXT:    s_lshr_b32 s16, s59, 16
+; SI-NEXT:    v_writelane_b32 v43, s16, 37
+; SI-NEXT:    s_lshr_b32 s16, s59, 8
+; SI-NEXT:    v_writelane_b32 v43, s16, 38
+; SI-NEXT:    s_lshr_b32 s16, s47, 24
+; SI-NEXT:    v_writelane_b32 v43, s16, 39
+; SI-NEXT:    s_lshr_b32 s16, s47, 16
+; SI-NEXT:    v_writelane_b32 v43, s16, 40
+; SI-NEXT:    s_lshr_b32 s16, s47, 8
+; SI-NEXT:    v_writelane_b32 v43, s16, 41
+; SI-NEXT:    s_lshr_b32 s16, s43, 24
+; SI-NEXT:    v_writelane_b32 v43, s16, 42
+; SI-NEXT:    s_lshr_b32 s16, s43, 16
+; SI-NEXT:    v_writelane_b32 v43, s16, 43
+; SI-NEXT:    s_lshr_b32 s16, s43, 8
+; SI-NEXT:    v_writelane_b32 v43, s16, 44
+; SI-NEXT:    s_lshr_b32 s16, s41, 24
+; SI-NEXT:    v_writelane_b32 v43, s16, 45
+; SI-NEXT:    s_lshr_b32 s16, s41, 16
+; SI-NEXT:    s_add_i32 s15, s15, 0x30000
+; SI-NEXT:    v_writelane_b32 v43, s16, 46
+; SI-NEXT:    s_lshr_b32 s16, s41, 8
+; SI-NEXT:    v_writelane_b32 v43, s16, 47
+; SI-NEXT:    s_lshr_b32 s16, s15, 24
+; SI-NEXT:    v_writelane_b32 v43, s16, 48
+; SI-NEXT:    s_lshr_b32 s16, s15, 16
+; SI-NEXT:    s_add_i32 s13, s13, 0x30000
+; SI-NEXT:    v_writelane_b32 v43, s16, 49
+; SI-NEXT:    s_lshr_b32 s16, s15, 8
+; SI-NEXT:    v_writelane_b32 v43, s16, 50
+; SI-NEXT:    s_lshr_b32 s16, s13, 24
+; SI-NEXT:    v_writelane_b32 v43, s16, 51
+; SI-NEXT:    s_lshr_b32 s16, s13, 16
+; SI-NEXT:    s_add_i32 s11, s11, 0x30000
+; SI-NEXT:    v_writelane_b32 v43, s16, 52
+; SI-NEXT:    s_lshr_b32 s16, s13, 8
+; SI-NEXT:    v_writelane_b32 v43, s16, 53
+; SI-NEXT:    s_lshr_b32 s16, s11, 24
+; SI-NEXT:    v_writelane_b32 v43, s16, 54
+; SI-NEXT:    s_lshr_b32 s16, s11, 16
+; SI-NEXT:    s_add_i32 s9, s9, 0x30000
+; SI-NEXT:    v_writelane_b32 v43, s16, 55
+; SI-NEXT:    s_lshr_b32 s16, s11, 8
+; SI-NEXT:    v_writelane_b32 v43, s16, 56
+; SI-NEXT:    s_lshr_b32 s16, s9, 24
+; SI-NEXT:    v_writelane_b32 v43, s16, 57
+; SI-NEXT:    s_lshr_b32 s16, s9, 16
+; SI-NEXT:    s_add_i32 s7, s7, 0x30000
+; SI-NEXT:    v_writelane_b32 v43, s16, 58
+; SI-NEXT:    s_lshr_b32 s16, s9, 8
+; SI-NEXT:    v_writelane_b32 v43, s16, 59
+; SI-NEXT:    s_lshr_b32 s16, s7, 24
+; SI-NEXT:    v_writelane_b32 v43, s16, 60
+; SI-NEXT:    s_lshr_b32 s16, s7, 16
+; SI-NEXT:    s_add_i32 s5, s5, 0x30000
+; SI-NEXT:    v_writelane_b32 v43, s16, 61
+; SI-NEXT:    s_lshr_b32 s16, s7, 8
+; SI-NEXT:    v_writelane_b32 v43, s16, 62
+; SI-NEXT:    s_lshr_b32 s16, s5, 24
+; SI-NEXT:    v_writelane_b32 v43, s16, 63
+; SI-NEXT:    s_lshr_b32 s16, s5, 16
+; SI-NEXT:    v_writelane_b32 v42, s16, 0
+; SI-NEXT:    s_lshr_b32 s16, s5, 8
+; SI-NEXT:    v_writelane_b32 v42, s16, 1
+; SI-NEXT:    s_lshr_b64 s[16:17], s[56:57], 24
+; SI-NEXT:    v_writelane_b32 v41, s16, 28
+; SI-NEXT:    v_writelane_b32 v41, s17, 29
+; SI-NEXT:    s_lshr_b64 s[16:17], s[56:57], 16
+; SI-NEXT:    v_writelane_b32 v41, s16, 26
+; SI-NEXT:    v_writelane_b32 v41, s17, 27
+; SI-NEXT:    s_lshr_b64 s[16:17], s[56:57], 8
+; SI-NEXT:    v_writelane_b32 v41, s16, 24
+; SI-NEXT:    v_writelane_b32 v41, s17, 25
+; SI-NEXT:    s_lshr_b64 s[16:17], s[46:47], 24
+; SI-NEXT:    v_writelane_b32 v41, s16, 32
+; SI-NEXT:    v_writelane_b32 v41, s17, 33
+; SI-NEXT:    s_lshr_b64 s[16:17], s[46:47], 16
+; SI-NEXT:    v_writelane_b32 v41, s16, 30
+; SI-NEXT:    v_writelane_b32 v41, s17, 31
+; SI-NEXT:    s_lshr_b64 s[16:17], s[42:43], 24
+; SI-NEXT:    v_writelane_b32 v41, s16, 38
+; SI-NEXT:    v_writelane_b32 v41, s17, 39
+; SI-NEXT:    s_lshr_b64 s[16:17], s[42:43], 16
+; SI-NEXT:    v_writelane_b32 v41, s16, 36
+; SI-NEXT:    v_writelane_b32 v41, s17, 37
+; SI-NEXT:    s_lshr_b64 s[16:17], s[42:43], 8
+; SI-NEXT:    v_writelane_b32 v41, s16, 34
+; SI-NEXT:    v_writelane_b32 v41, s17, 35
+; SI-NEXT:    s_lshr_b64 s[16:17], s[40:41], 24
+; SI-NEXT:    v_writelane_b32 v41, s16, 44
+; SI-NEXT:    v_writelane_b32 v41, s17, 45
+; SI-NEXT:    s_lshr_b64 s[16:17], s[40:41], 16
+; SI-NEXT:    v_writelane_b32 v41, s16, 42
+; SI-NEXT:    v_writelane_b32 v41, s17, 43
+; SI-NEXT:    s_lshr_b64 s[16:17], s[40:41], 8
+; SI-NEXT:    s_add_i32 s14, s14, 0x30000
+; SI-NEXT:    v_writelane_b32 v41, s16, 40
+; SI-NEXT:    v_writelane_b32 v41, s17, 41
+; SI-NEXT:    s_lshr_b64 s[16:17], s[14:15], 24
+; SI-NEXT:    v_writelane_b32 v41, s16, 50
+; SI-NEXT:    v_writelane_b32 v41, s17, 51
+; SI-NEXT:    s_lshr_b64 s[16:17], s[14:15], 16
+; SI-NEXT:    v_writelane_b32 v41, s16, 48
+; SI-NEXT:    v_writelane_b32 v41, s17, 49
+; SI-NEXT:    s_lshr_b64 s[16:17], s[14:15], 8
+; SI-NEXT:    s_add_i32 s12, s12, 0x30000
+; SI-NEXT:    v_writelane_b32 v41, s16, 46
+; SI-NEXT:    v_writelane_b32 v41, s17, 47
+; SI-NEXT:    s_lshr_b64 s[16:17], s[12:13], 24
+; SI-NEXT:    v_writelane_b32 v41, s16, 56
+; SI-NEXT:    v_writelane_b32 v41, s17, 57
+; SI-NEXT:    s_lshr_b64 s[16:17], s[12:13], 16
+; SI-NEXT:    v_writelane_b32 v41, s16, 54
+; SI-NEXT:    v_writelane_b32 v41, s17, 55
+; SI-NEXT:    s_lshr_b64 s[16:17], s[12:13], 8
+; SI-NEXT:    s_add_i32 s10, s10, 0x30000
+; SI-NEXT:    v_writelane_b32 v41, s16, 52
+; SI-NEXT:    v_writelane_b32 v41, s17, 53
+; SI-NEXT:    s_lshr_b64 s[16:17], s[10:11], 24
+; SI-NEXT:    v_writelane_b32 v41, s16, 62
+; SI-NEXT:    v_writelane_b32 v41, s17, 63
+; SI-NEXT:    s_lshr_b64 s[16:17], s[10:11], 16
+; SI-NEXT:    v_writelane_b32 v41, s16, 60
+; SI-NEXT:    v_writelane_b32 v41, s17, 61
+; SI-NEXT:    s_lshr_b64 s[16:17], s[10:11], 8
+; SI-NEXT:    s_add_i32 s8, s8, 0x30000
+; SI-NEXT:    v_writelane_b32 v41, s16, 58
+; SI-NEXT:    v_writelane_b32 v41, s17, 59
+; SI-NEXT:    s_lshr_b64 s[16:17], s[8:9], 24
+; SI-NEXT:    v_writelane_b32 v43, s16, 4
+; SI-NEXT:    v_writelane_b32 v43, s17, 5
+; SI-NEXT:    s_lshr_b64 s[16:17], s[8:9], 16
+; SI-NEXT:    v_writelane_b32 v43, s16, 2
+; SI-NEXT:    v_writelane_b32 v43, s17, 3
+; SI-NEXT:    s_lshr_b64 s[16:17], s[8:9], 8
+; SI-NEXT:    s_add_i32 s6, s6, 0x30000
+; SI-NEXT:    v_writelane_b32 v43, s16, 0
+; SI-NEXT:    v_writelane_b32 v43, s17, 1
+; SI-NEXT:    s_lshr_b64 s[16:17], s[6:7], 24
+; SI-NEXT:    v_writelane_b32 v43, s16, 10
+; SI-NEXT:    v_writelane_b32 v43, s17, 11
+; SI-NEXT:    s_lshr_b64 s[16:17], s[6:7], 16
+; SI-NEXT:    v_writelane_b32 v43, s16, 8
+; SI-NEXT:    v_writelane_b32 v43, s17, 9
+; SI-NEXT:    s_lshr_b64 s[16:17], s[6:7], 8
+; SI-NEXT:    s_add_i32 s4, s4, 0x30000
+; SI-NEXT:    v_writelane_b32 v43, s16, 6
+; SI-NEXT:    v_writelane_b32 v43, s17, 7
+; SI-NEXT:    s_lshr_b64 s[16:17], s[4:5], 24
+; SI-NEXT:    v_writelane_b32 v43, s16, 16
+; SI-NEXT:    v_writelane_b32 v43, s17, 17
+; SI-NEXT:    s_lshr_b64 s[16:17], s[4:5], 16
+; SI-NEXT:    v_writelane_b32 v43, s16, 14
+; SI-NEXT:    v_writelane_b32 v43, s17, 15
+; SI-NEXT:    s_lshr_b64 s[16:17], s[4:5], 8
+; SI-NEXT:    s_lshr_b64 s[92:93], s[60:61], 8
+; SI-NEXT:    s_lshr_b64 s[70:71], s[44:45], 24
+; SI-NEXT:    s_lshr_b64 s[80:81], s[44:45], 16
+; SI-NEXT:    s_lshr_b64 s[82:83], s[44:45], 8
+; SI-NEXT:    s_lshr_b64 s[64:65], s[74:75], 24
+; SI-NEXT:    s_lshr_b64 s[66:67], s[74:75], 16
+; SI-NEXT:    s_lshr_b64 s[68:69], s[74:75], 8
+; SI-NEXT:    s_lshr_b64 s[50:51], s[72:73], 24
+; SI-NEXT:    s_lshr_b64 s[52:53], s[72:73], 16
+; SI-NEXT:    s_lshr_b64 s[54:55], s[72:73], 8
+; SI-NEXT:    s_lshr_b64 s[36:37], s[62:63], 24
+; SI-NEXT:    s_lshr_b64 s[38:39], s[62:63], 16
+; SI-NEXT:    s_lshr_b64 s[48:49], s[62:63], 8
+; SI-NEXT:    s_lshr_b64 s[94:95], s[58:59], 24
+; SI-NEXT:    s_lshr_b64 s[30:31], s[58:59], 16
+; SI-NEXT:    s_lshr_b64 s[34:35], s[58:59], 8
+; SI-NEXT:    s_lshr_b64 s[90:91], s[46:47], 8
+; SI-NEXT:    v_writelane_b32 v43, s16, 12
+; SI-NEXT:    v_writelane_b32 v43, s17, 13
+; SI-NEXT:  .LBB99_3: ; %end
+; SI-NEXT:    s_lshl_b32 s17, s92, 8
+; SI-NEXT:    s_and_b32 s18, s60, 0xff
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    v_readlane_b32 s18, v41, 20
+; SI-NEXT:    v_readlane_b32 s19, v41, 21
+; SI-NEXT:    s_and_b32 s18, s18, 0xff
+; SI-NEXT:    v_readlane_b32 s20, v41, 22
+; SI-NEXT:    s_lshl_b32 s19, s20, 24
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_or_b32 s18, s19, s18
+; SI-NEXT:    s_and_b32 s17, s17, 0xffff
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    v_readlane_b32 s16, v43, 20
+; SI-NEXT:    v_mov_b32_e32 v1, s17
+; SI-NEXT:    s_and_b32 s17, s61, 0xff
+; SI-NEXT:    s_lshl_b32 s18, s16, 8
+; SI-NEXT:    v_readlane_b32 s16, v43, 19
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    s_and_b32 s18, s16, 0xff
+; SI-NEXT:    v_readlane_b32 s16, v43, 18
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_lshl_b32 s19, s16, 24
+; SI-NEXT:    s_or_b32 s18, s19, s18
+; SI-NEXT:    s_and_b32 s17, s17, 0xffff
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    v_mov_b32_e32 v2, s17
+; SI-NEXT:    v_readlane_b32 s16, v41, 24
+; SI-NEXT:    v_readlane_b32 s17, v41, 25
+; SI-NEXT:    s_lshl_b32 s17, s16, 8
+; SI-NEXT:    s_and_b32 s18, s56, 0xff
+; SI-NEXT:    v_readlane_b32 s21, v41, 23
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    v_readlane_b32 s18, v41, 26
+; SI-NEXT:    v_readlane_b32 s19, v41, 27
+; SI-NEXT:    s_and_b32 s18, s18, 0xff
+; SI-NEXT:    v_readlane_b32 s20, v41, 28
+; SI-NEXT:    s_lshl_b32 s19, s20, 24
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_or_b32 s18, s19, s18
+; SI-NEXT:    s_and_b32 s17, s17, 0xffff
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    v_readlane_b32 s16, v43, 23
+; SI-NEXT:    v_mov_b32_e32 v3, s17
+; SI-NEXT:    s_and_b32 s17, s57, 0xff
+; SI-NEXT:    s_lshl_b32 s18, s16, 8
+; SI-NEXT:    v_readlane_b32 s16, v43, 22
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    s_and_b32 s18, s16, 0xff
+; SI-NEXT:    v_readlane_b32 s16, v43, 21
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_lshl_b32 s19, s16, 24
+; SI-NEXT:    s_or_b32 s18, s19, s18
+; SI-NEXT:    s_and_b32 s17, s17, 0xffff
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    v_mov_b32_e32 v4, s17
+; SI-NEXT:    s_lshl_b32 s17, s82, 8
+; SI-NEXT:    s_and_b32 s18, s44, 0xff
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_and_b32 s18, s80, 0xff
+; SI-NEXT:    s_lshl_b32 s19, s70, 24
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_or_b32 s18, s19, s18
+; SI-NEXT:    s_and_b32 s17, s17, 0xffff
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    v_readlane_b32 s16, v43, 26
+; SI-NEXT:    v_mov_b32_e32 v5, s17
+; SI-NEXT:    s_and_b32 s17, s45, 0xff
+; SI-NEXT:    s_lshl_b32 s18, s16, 8
+; SI-NEXT:    v_readlane_b32 s16, v43, 25
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    s_and_b32 s18, s16, 0xff
+; SI-NEXT:    v_readlane_b32 s16, v43, 24
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_lshl_b32 s19, s16, 24
+; SI-NEXT:    s_or_b32 s18, s19, s18
+; SI-NEXT:    s_and_b32 s17, s17, 0xffff
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    v_mov_b32_e32 v6, s17
+; SI-NEXT:    s_lshl_b32 s17, s68, 8
+; SI-NEXT:    s_and_b32 s18, s74, 0xff
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_and_b32 s18, s66, 0xff
+; SI-NEXT:    s_lshl_b32 s19, s64, 24
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_or_b32 s18, s19, s18
+; SI-NEXT:    s_and_b32 s17, s17, 0xffff
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    v_readlane_b32 s16, v43, 29
+; SI-NEXT:    v_mov_b32_e32 v7, s17
+; SI-NEXT:    s_and_b32 s17, s75, 0xff
+; SI-NEXT:    s_lshl_b32 s18, s16, 8
+; SI-NEXT:    v_readlane_b32 s16, v43, 28
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    s_and_b32 s18, s16, 0xff
+; SI-NEXT:    v_readlane_b32 s16, v43, 27
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_lshl_b32 s19, s16, 24
+; SI-NEXT:    s_or_b32 s18, s19, s18
+; SI-NEXT:    s_and_b32 s17, s17, 0xffff
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    v_mov_b32_e32 v8, s17
+; SI-NEXT:    s_lshl_b32 s17, s54, 8
+; SI-NEXT:    s_and_b32 s18, s72, 0xff
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_and_b32 s18, s52, 0xff
+; SI-NEXT:    s_lshl_b32 s19, s50, 24
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_or_b32 s18, s19, s18
+; SI-NEXT:    s_and_b32 s17, s17, 0xffff
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    v_readlane_b32 s16, v43, 32
+; SI-NEXT:    v_mov_b32_e32 v9, s17
+; SI-NEXT:    s_and_b32 s17, s73, 0xff
+; SI-NEXT:    s_lshl_b32 s18, s16, 8
+; SI-NEXT:    v_readlane_b32 s16, v43, 31
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    s_and_b32 s18, s16, 0xff
+; SI-NEXT:    v_readlane_b32 s16, v43, 30
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_lshl_b32 s19, s16, 24
+; SI-NEXT:    s_or_b32 s18, s19, s18
+; SI-NEXT:    s_and_b32 s17, s17, 0xffff
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    v_mov_b32_e32 v10, s17
+; SI-NEXT:    s_lshl_b32 s17, s48, 8
+; SI-NEXT:    s_and_b32 s18, s62, 0xff
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_and_b32 s18, s38, 0xff
+; SI-NEXT:    s_lshl_b32 s19, s36, 24
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_or_b32 s18, s19, s18
+; SI-NEXT:    s_and_b32 s17, s17, 0xffff
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    v_readlane_b32 s16, v43, 35
+; SI-NEXT:    v_mov_b32_e32 v11, s17
+; SI-NEXT:    s_and_b32 s17, s63, 0xff
+; SI-NEXT:    s_lshl_b32 s18, s16, 8
+; SI-NEXT:    v_readlane_b32 s16, v43, 34
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    s_and_b32 s18, s16, 0xff
+; SI-NEXT:    v_readlane_b32 s16, v43, 33
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_lshl_b32 s19, s16, 24
+; SI-NEXT:    s_or_b32 s18, s19, s18
+; SI-NEXT:    s_and_b32 s17, s17, 0xffff
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    v_mov_b32_e32 v12, s17
+; SI-NEXT:    s_lshl_b32 s17, s34, 8
+; SI-NEXT:    s_and_b32 s18, s58, 0xff
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_and_b32 s18, s30, 0xff
+; SI-NEXT:    s_lshl_b32 s19, s94, 24
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_or_b32 s18, s19, s18
+; SI-NEXT:    s_and_b32 s17, s17, 0xffff
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    v_readlane_b32 s16, v43, 38
+; SI-NEXT:    v_mov_b32_e32 v13, s17
+; SI-NEXT:    s_and_b32 s17, s59, 0xff
+; SI-NEXT:    s_lshl_b32 s18, s16, 8
+; SI-NEXT:    v_readlane_b32 s16, v43, 37
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    s_and_b32 s18, s16, 0xff
+; SI-NEXT:    v_readlane_b32 s16, v43, 36
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_lshl_b32 s19, s16, 24
+; SI-NEXT:    s_or_b32 s18, s19, s18
+; SI-NEXT:    s_and_b32 s17, s17, 0xffff
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    v_mov_b32_e32 v14, s17
+; SI-NEXT:    s_lshl_b32 s17, s90, 8
+; SI-NEXT:    s_and_b32 s18, s46, 0xff
+; SI-NEXT:    v_readlane_b32 s21, v41, 29
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    v_readlane_b32 s18, v41, 30
+; SI-NEXT:    v_readlane_b32 s19, v41, 31
+; SI-NEXT:    s_and_b32 s18, s18, 0xff
+; SI-NEXT:    v_readlane_b32 s20, v41, 32
+; SI-NEXT:    s_lshl_b32 s19, s20, 24
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_or_b32 s18, s19, s18
+; SI-NEXT:    s_and_b32 s17, s17, 0xffff
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    v_readlane_b32 s16, v43, 41
+; SI-NEXT:    v_mov_b32_e32 v15, s17
+; SI-NEXT:    s_and_b32 s17, s47, 0xff
+; SI-NEXT:    s_lshl_b32 s18, s16, 8
+; SI-NEXT:    v_readlane_b32 s16, v43, 40
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    s_and_b32 s18, s16, 0xff
+; SI-NEXT:    v_readlane_b32 s16, v43, 39
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_lshl_b32 s19, s16, 24
+; SI-NEXT:    s_or_b32 s18, s19, s18
+; SI-NEXT:    s_and_b32 s17, s17, 0xffff
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v10
-; SI-NEXT:    s_and_b32 s4, s14, 0xff
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s35, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s14, s39, 24
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s14, s5
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_lshl_b32 s5, s92, 8
-; SI-NEXT:    v_readlane_b32 s39, v63, 7
-; SI-NEXT:    v_readlane_b32 s35, v63, 3
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
-; SI-NEXT:    v_or_b32_e32 v3, v3, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    v_or_b32_e32 v7, v10, v7
-; SI-NEXT:    v_or_b32_e32 v3, v3, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 56, v0
-; SI-NEXT:    buffer_store_dword v3, v7, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
+; SI-NEXT:    s_or_b32 s17, s17, s18
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
+; SI-NEXT:    v_mov_b32_e32 v16, s17
+; SI-NEXT:    v_readlane_b32 s16, v41, 34
+; SI-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 12, v0
+; SI-NEXT:    v_readlane_b32 s17, v41, 35
+; SI-NEXT:    v_readlane_b32 s18, v41, 36
+; SI-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 16, v0
+; SI-NEXT:    s_lshl_b32 s16, s16, 8
+; SI-NEXT:    s_and_b32 s17, s42, 0xff
+; SI-NEXT:    v_readlane_b32 s19, v41, 37
+; SI-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 20, v0
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_and_b32 s17, s18, 0xff
+; SI-NEXT:    v_readlane_b32 s18, v41, 38
+; SI-NEXT:    buffer_store_dword v6, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 24, v0
+; SI-NEXT:    s_lshl_b32 s18, s18, 24
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    buffer_store_dword v7, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 28, v0
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 32, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v9, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 36, v0
+; SI-NEXT:    v_readlane_b32 s17, v43, 44
+; SI-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 40, v0
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s43, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 8
+; SI-NEXT:    buffer_store_dword v11, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 44, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_readlane_b32 s17, v43, 43
+; SI-NEXT:    buffer_store_dword v12, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 48, v0
+; SI-NEXT:    s_and_b32 s17, s17, 0xff
+; SI-NEXT:    v_readlane_b32 s18, v43, 42
+; SI-NEXT:    v_readlane_b32 s19, v41, 39
+; SI-NEXT:    buffer_store_dword v13, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 52, v0
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_lshl_b32 s18, s18, 24
+; SI-NEXT:    buffer_store_dword v14, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 56, v0
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    v_readlane_b32 s18, v41, 40
+; SI-NEXT:    buffer_store_dword v15, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 60, v0
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    v_readlane_b32 s19, v41, 41
+; SI-NEXT:    buffer_store_dword v16, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 64, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    s_lshl_b32 s17, s18, 8
+; SI-NEXT:    v_readlane_b32 s18, v41, 42
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 60, v0
-; SI-NEXT:    v_mov_b32_e32 v7, s4
-; SI-NEXT:    buffer_store_dword v7, v3, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v8
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s40, 0xff
+; SI-NEXT:    v_readlane_b32 s19, v41, 43
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    s_and_b32 s17, s18, 0xff
+; SI-NEXT:    v_readlane_b32 s18, v41, 44
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_lshl_b32 s18, s18, 24
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x44, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_readlane_b32 s17, v43, 47
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v57
-; SI-NEXT:    s_and_b32 s4, s13, 0xff
-; SI-NEXT:    v_or_b32_e32 v3, v3, v7
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v47
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s30, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 24, v45
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s13, s36, 24
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v7, v8, v7
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s13, s5
-; SI-NEXT:    v_or_b32_e32 v3, v3, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 64, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v3, v7, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s41, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 8
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_readlane_b32 s17, v43, 46
+; SI-NEXT:    s_and_b32 s17, s17, 0xff
+; SI-NEXT:    v_readlane_b32 s18, v43, 45
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_lshl_b32 s18, s18, 24
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x48, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x44, v0
-; SI-NEXT:    v_mov_b32_e32 v7, s4
-; SI-NEXT:    buffer_store_dword v7, v3, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v44
-; SI-NEXT:    s_and_b32 s4, s12, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s89, 8
-; SI-NEXT:    v_or_b32_e32 v3, v3, v5
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v43
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s93, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    v_readlane_b32 s16, v41, 46
+; SI-NEXT:    s_and_b32 s14, s14, 0xff
+; SI-NEXT:    v_readlane_b32 s17, v41, 47
+; SI-NEXT:    s_lshl_b32 s16, s16, 8
+; SI-NEXT:    v_readlane_b32 s19, v41, 45
+; SI-NEXT:    s_or_b32 s14, s14, s16
+; SI-NEXT:    v_readlane_b32 s16, v41, 48
+; SI-NEXT:    v_readlane_b32 s17, v41, 49
+; SI-NEXT:    s_and_b32 s16, s16, 0xff
+; SI-NEXT:    v_readlane_b32 s18, v41, 50
+; SI-NEXT:    s_lshl_b32 s16, s16, 16
+; SI-NEXT:    s_lshl_b32 s17, s18, 24
+; SI-NEXT:    s_and_b32 s14, s14, 0xffff
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x4c, v0
+; SI-NEXT:    s_or_b32 s14, s14, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v41
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s12, s31, 24
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v5, v7, v5
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s12, s5
-; SI-NEXT:    v_or_b32_e32 v3, v3, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v2, s14
+; SI-NEXT:    s_and_b32 s14, s15, 0xff
+; SI-NEXT:    v_readlane_b32 s15, v43, 50
+; SI-NEXT:    s_lshl_b32 s15, s15, 8
+; SI-NEXT:    s_or_b32 s14, s14, s15
+; SI-NEXT:    v_readlane_b32 s15, v43, 49
+; SI-NEXT:    s_and_b32 s15, s15, 0xff
+; SI-NEXT:    v_readlane_b32 s16, v43, 48
+; SI-NEXT:    s_lshl_b32 s15, s15, 16
+; SI-NEXT:    s_lshl_b32 s16, s16, 24
+; SI-NEXT:    s_and_b32 s14, s14, 0xffff
+; SI-NEXT:    s_or_b32 s15, s16, s15
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x50, v0
+; SI-NEXT:    s_or_b32 s14, s14, s15
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x4c, v0
-; SI-NEXT:    v_mov_b32_e32 v5, s4
-; SI-NEXT:    v_readlane_b32 s5, v62, 1
-; SI-NEXT:    buffer_store_dword v5, v3, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v13
+; SI-NEXT:    v_mov_b32_e32 v2, s14
+; SI-NEXT:    v_readlane_b32 s14, v41, 52
+; SI-NEXT:    s_and_b32 s12, s12, 0xff
+; SI-NEXT:    v_readlane_b32 s15, v41, 53
+; SI-NEXT:    s_lshl_b32 s14, s14, 8
+; SI-NEXT:    s_or_b32 s12, s12, s14
+; SI-NEXT:    v_readlane_b32 s14, v41, 54
+; SI-NEXT:    v_readlane_b32 s15, v41, 55
+; SI-NEXT:    s_and_b32 s14, s14, 0xff
+; SI-NEXT:    v_readlane_b32 s16, v41, 56
+; SI-NEXT:    s_lshl_b32 s14, s14, 16
+; SI-NEXT:    s_lshl_b32 s15, s16, 24
+; SI-NEXT:    s_and_b32 s12, s12, 0xffff
+; SI-NEXT:    s_or_b32 s14, s15, s14
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x54, v0
+; SI-NEXT:    s_or_b32 s12, s12, s14
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v24
-; SI-NEXT:    s_and_b32 s4, s11, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s5, 8
-; SI-NEXT:    v_or_b32_e32 v3, v3, v5
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v22
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s90, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v21
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s11, s94, 24
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v5, v7, v5
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s11, s5
-; SI-NEXT:    v_or_b32_e32 v3, v3, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x50, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s5, v62, 3
-; SI-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v2, s12
+; SI-NEXT:    s_and_b32 s12, s13, 0xff
+; SI-NEXT:    v_readlane_b32 s13, v43, 53
+; SI-NEXT:    s_lshl_b32 s13, s13, 8
+; SI-NEXT:    s_or_b32 s12, s12, s13
+; SI-NEXT:    v_readlane_b32 s13, v43, 52
+; SI-NEXT:    s_and_b32 s13, s13, 0xff
+; SI-NEXT:    v_readlane_b32 s14, v43, 51
+; SI-NEXT:    s_lshl_b32 s13, s13, 16
+; SI-NEXT:    s_lshl_b32 s14, s14, 24
+; SI-NEXT:    s_and_b32 s12, s12, 0xffff
+; SI-NEXT:    s_or_b32 s13, s14, s13
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x58, v0
+; SI-NEXT:    s_or_b32 s12, s12, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x54, v0
-; SI-NEXT:    v_mov_b32_e32 v5, s4
-; SI-NEXT:    s_and_b32 s4, s10, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s5, 8
-; SI-NEXT:    buffer_store_dword v5, v3, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v9
+; SI-NEXT:    v_mov_b32_e32 v2, s12
+; SI-NEXT:    v_readlane_b32 s12, v41, 58
+; SI-NEXT:    s_and_b32 s10, s10, 0xff
+; SI-NEXT:    v_readlane_b32 s13, v41, 59
+; SI-NEXT:    s_lshl_b32 s12, s12, 8
+; SI-NEXT:    s_or_b32 s10, s10, s12
+; SI-NEXT:    v_readlane_b32 s12, v41, 60
+; SI-NEXT:    v_readlane_b32 s13, v41, 61
+; SI-NEXT:    s_and_b32 s12, s12, 0xff
+; SI-NEXT:    v_readlane_b32 s14, v41, 62
+; SI-NEXT:    s_lshl_b32 s12, s12, 16
+; SI-NEXT:    s_lshl_b32 s13, s14, 24
+; SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; SI-NEXT:    s_or_b32 s12, s13, s12
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x5c, v0
+; SI-NEXT:    s_or_b32 s10, s10, s12
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v20
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s5, v62, 2
-; SI-NEXT:    v_or_b32_e32 v3, v3, v5
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v18
-; SI-NEXT:    s_and_b32 s5, s5, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v17
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s10, s91, 24
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v5, v7, v5
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s10, s5
-; SI-NEXT:    v_or_b32_e32 v3, v3, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x58, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s5, v62, 6
-; SI-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s10, s11, 0xff
+; SI-NEXT:    v_readlane_b32 s11, v43, 56
+; SI-NEXT:    s_lshl_b32 s11, s11, 8
+; SI-NEXT:    s_or_b32 s10, s10, s11
+; SI-NEXT:    v_readlane_b32 s11, v43, 55
+; SI-NEXT:    s_and_b32 s11, s11, 0xff
+; SI-NEXT:    v_readlane_b32 s12, v43, 54
+; SI-NEXT:    s_lshl_b32 s11, s11, 16
+; SI-NEXT:    s_lshl_b32 s12, s12, 24
+; SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; SI-NEXT:    s_or_b32 s11, s12, s11
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x60, v0
+; SI-NEXT:    s_or_b32 s10, s10, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x5c, v0
-; SI-NEXT:    v_mov_b32_e32 v5, s4
-; SI-NEXT:    s_and_b32 s4, s9, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s5, 8
-; SI-NEXT:    buffer_store_dword v5, v3, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v6
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    v_readlane_b32 s10, v43, 0
+; SI-NEXT:    s_and_b32 s8, s8, 0xff
+; SI-NEXT:    v_readlane_b32 s11, v43, 1
+; SI-NEXT:    s_lshl_b32 s10, s10, 8
+; SI-NEXT:    s_or_b32 s8, s8, s10
+; SI-NEXT:    v_readlane_b32 s10, v43, 2
+; SI-NEXT:    v_readlane_b32 s11, v43, 3
+; SI-NEXT:    s_and_b32 s10, s10, 0xff
+; SI-NEXT:    v_readlane_b32 s12, v43, 4
+; SI-NEXT:    s_lshl_b32 s10, s10, 16
+; SI-NEXT:    s_lshl_b32 s11, s12, 24
+; SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; SI-NEXT:    s_or_b32 s10, s11, s10
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x64, v0
+; SI-NEXT:    s_or_b32 s8, s8, s10
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v61
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s5, v62, 5
-; SI-NEXT:    v_or_b32_e32 v3, v3, v5
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v60
-; SI-NEXT:    s_and_b32 s5, s5, 0xff
-; SI-NEXT:    v_readlane_b32 s9, v62, 4
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v59
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s9, s9, 24
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v5, v6, v5
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s9, s5
-; SI-NEXT:    v_or_b32_e32 v3, v3, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x60, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s5, v62, 9
-; SI-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s8, s9, 0xff
+; SI-NEXT:    v_readlane_b32 s9, v43, 59
+; SI-NEXT:    s_lshl_b32 s9, s9, 8
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    v_readlane_b32 s9, v43, 58
+; SI-NEXT:    s_and_b32 s9, s9, 0xff
+; SI-NEXT:    v_readlane_b32 s10, v43, 57
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_lshl_b32 s10, s10, 24
+; SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; SI-NEXT:    s_or_b32 s9, s10, s9
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x68, v0
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x64, v0
-; SI-NEXT:    v_mov_b32_e32 v5, s4
-; SI-NEXT:    s_and_b32 s4, s8, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s5, 8
-; SI-NEXT:    buffer_store_dword v5, v3, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v58
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s5, v62, 8
-; SI-NEXT:    v_or_b32_e32 v3, v3, v4
-; SI-NEXT:    v_and_b32_e32 v4, 0xff, v56
-; SI-NEXT:    s_and_b32 s5, s5, 0xff
-; SI-NEXT:    v_readlane_b32 s8, v62, 7
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_readlane_b32 s8, v43, 6
+; SI-NEXT:    s_and_b32 s6, s6, 0xff
+; SI-NEXT:    v_readlane_b32 s9, v43, 7
+; SI-NEXT:    s_lshl_b32 s8, s8, 8
+; SI-NEXT:    s_or_b32 s6, s6, s8
+; SI-NEXT:    v_readlane_b32 s8, v43, 8
+; SI-NEXT:    v_readlane_b32 s9, v43, 9
+; SI-NEXT:    s_and_b32 s8, s8, 0xff
+; SI-NEXT:    v_readlane_b32 s10, v43, 10
+; SI-NEXT:    s_lshl_b32 s8, s8, 16
+; SI-NEXT:    s_lshl_b32 s9, s10, 24
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_or_b32 s8, s9, s8
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x6c, v0
+; SI-NEXT:    s_or_b32 s6, s6, s8
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 24, v46
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s6, s7, 0xff
+; SI-NEXT:    v_readlane_b32 s7, v43, 62
+; SI-NEXT:    s_lshl_b32 s7, s7, 8
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    v_readlane_b32 s7, v43, 61
+; SI-NEXT:    s_and_b32 s7, s7, 0xff
+; SI-NEXT:    v_readlane_b32 s8, v43, 60
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
 ; SI-NEXT:    s_lshl_b32 s8, s8, 24
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v4, v5, v4
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s8, s5
-; SI-NEXT:    v_or_b32_e32 v3, v3, v4
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x68, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s5, v62, 12
-; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x6c, v0
-; SI-NEXT:    v_mov_b32_e32 v4, s4
-; SI-NEXT:    s_and_b32 s4, s7, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s5, 8
-; SI-NEXT:    buffer_store_dword v4, v3, s[0:3], 0 offen
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v42
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s5, v62, 11
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v40
-; SI-NEXT:    s_and_b32 s5, s5, 0xff
-; SI-NEXT:    v_readlane_b32 s7, v62, 10
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_or_b32 s7, s8, s7
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x70, v0
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v55
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s7, s7, 24
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    v_or_b32_e32 v3, v4, v3
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    v_readlane_b32 s6, v43, 12
+; SI-NEXT:    s_and_b32 s4, s4, 0xff
+; SI-NEXT:    v_readlane_b32 s7, v43, 13
+; SI-NEXT:    s_lshl_b32 s6, s6, 8
+; SI-NEXT:    s_or_b32 s4, s4, s6
+; SI-NEXT:    v_readlane_b32 s6, v43, 14
+; SI-NEXT:    v_readlane_b32 s7, v43, 15
+; SI-NEXT:    s_and_b32 s6, s6, 0xff
+; SI-NEXT:    v_readlane_b32 s8, v43, 16
+; SI-NEXT:    s_lshl_b32 s6, s6, 16
+; SI-NEXT:    s_lshl_b32 s7, s8, 24
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s7, s5
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x70, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s5, v62, 15
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT:    s_or_b32 s6, s7, s6
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x74, v0
+; SI-NEXT:    s_or_b32 s4, s4, s6
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x74, v0
-; SI-NEXT:    v_mov_b32_e32 v3, s4
-; SI-NEXT:    s_and_b32 s4, s6, 0xff
+; SI-NEXT:    v_mov_b32_e32 v2, s4
+; SI-NEXT:    s_and_b32 s4, s5, 0xff
+; SI-NEXT:    v_readlane_b32 s5, v42, 1
 ; SI-NEXT:    s_lshl_b32 s5, s5, 8
-; SI-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v54
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_readlane_b32 s5, v62, 14
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v53
+; SI-NEXT:    v_readlane_b32 s5, v42, 0
 ; SI-NEXT:    s_and_b32 s5, s5, 0xff
-; SI-NEXT:    v_readlane_b32 s6, v62, 13
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v52
+; SI-NEXT:    v_readlane_b32 s6, v43, 63
 ; SI-NEXT:    s_lshl_b32 s5, s5, 16
 ; SI-NEXT:    s_lshl_b32 s6, s6, 24
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s5, s6, s5
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x78, v0
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x78, v0
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
+; SI-NEXT:    v_readlane_b32 s21, v41, 33
+; SI-NEXT:    v_readlane_b32 s19, v41, 51
+; SI-NEXT:    v_readlane_b32 s17, v41, 57
+; SI-NEXT:    v_readlane_b32 s15, v41, 63
+; SI-NEXT:    v_readlane_b32 s13, v43, 5
+; SI-NEXT:    v_readlane_b32 s11, v43, 11
+; SI-NEXT:    v_readlane_b32 s9, v43, 17
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT:    v_readlane_b32 s36, v63, 4
-; SI-NEXT:    v_readlane_b32 s31, v63, 1
-; SI-NEXT:    v_readlane_b32 s30, v63, 0
+; SI-NEXT:    v_readlane_b32 s99, v40, 35
+; SI-NEXT:    v_readlane_b32 s98, v40, 34
+; SI-NEXT:    v_readlane_b32 s97, v40, 33
+; SI-NEXT:    v_readlane_b32 s96, v40, 32
+; SI-NEXT:    v_readlane_b32 s87, v40, 31
+; SI-NEXT:    v_readlane_b32 s86, v40, 30
+; SI-NEXT:    v_readlane_b32 s85, v40, 29
+; SI-NEXT:    v_readlane_b32 s84, v40, 28
+; SI-NEXT:    v_readlane_b32 s83, v40, 27
+; SI-NEXT:    v_readlane_b32 s82, v40, 26
+; SI-NEXT:    v_readlane_b32 s81, v40, 25
+; SI-NEXT:    v_readlane_b32 s80, v40, 24
+; SI-NEXT:    v_readlane_b32 s71, v40, 23
+; SI-NEXT:    v_readlane_b32 s70, v40, 22
+; SI-NEXT:    v_readlane_b32 s69, v40, 21
+; SI-NEXT:    v_readlane_b32 s68, v40, 20
+; SI-NEXT:    v_readlane_b32 s67, v40, 19
+; SI-NEXT:    v_readlane_b32 s66, v40, 18
+; SI-NEXT:    v_readlane_b32 s65, v40, 17
+; SI-NEXT:    v_readlane_b32 s64, v40, 16
+; SI-NEXT:    v_readlane_b32 s55, v40, 15
+; SI-NEXT:    v_readlane_b32 s54, v40, 14
+; SI-NEXT:    v_readlane_b32 s53, v40, 13
+; SI-NEXT:    v_readlane_b32 s52, v40, 12
+; SI-NEXT:    v_readlane_b32 s51, v40, 11
+; SI-NEXT:    v_readlane_b32 s50, v40, 10
+; SI-NEXT:    v_readlane_b32 s49, v40, 9
+; SI-NEXT:    v_readlane_b32 s48, v40, 8
+; SI-NEXT:    v_readlane_b32 s39, v40, 7
+; SI-NEXT:    v_readlane_b32 s38, v40, 6
+; SI-NEXT:    v_readlane_b32 s37, v40, 5
+; SI-NEXT:    v_readlane_b32 s36, v40, 4
+; SI-NEXT:    v_readlane_b32 s35, v40, 3
+; SI-NEXT:    v_readlane_b32 s34, v40, 2
+; SI-NEXT:    v_readlane_b32 s31, v40, 1
+; SI-NEXT:    v_readlane_b32 s30, v40, 0
 ; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB99_4:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; kill: killed $sgpr6
-; SI-NEXT:    ; implicit-def: $vcc_lo
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; kill: killed $sgpr6
-; SI-NEXT:    ; kill: killed $vcc_lo
-; SI-NEXT:    ; implicit-def: $vcc_lo
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr16
+; SI-NEXT:    v_writelane_b32 v41, s4, 20
+; SI-NEXT:    v_writelane_b32 v41, s5, 21
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr60
+; SI-NEXT:    ; implicit-def: $sgpr92
 ; SI-NEXT:    ; implicit-def: $sgpr56
-; SI-NEXT:    ; implicit-def: $sgpr57
-; SI-NEXT:    ; implicit-def: $sgpr68
-; SI-NEXT:    ; implicit-def: $sgpr71
-; SI-NEXT:    ; implicit-def: $sgpr83
-; SI-NEXT:    ; implicit-def: $sgpr46
-; SI-NEXT:    ; implicit-def: $vgpr51
-; SI-NEXT:    ; implicit-def: $sgpr47
-; SI-NEXT:    ; implicit-def: $sgpr65
-; SI-NEXT:    ; implicit-def: $sgpr69
-; SI-NEXT:    ; implicit-def: $sgpr80
 ; SI-NEXT:    ; implicit-def: $sgpr44
-; SI-NEXT:    ; implicit-def: $sgpr45
-; SI-NEXT:    ; implicit-def: $sgpr54
-; SI-NEXT:    ; implicit-def: $sgpr66
+; SI-NEXT:    ; implicit-def: $sgpr82
+; SI-NEXT:    ; implicit-def: $sgpr80
 ; SI-NEXT:    ; implicit-def: $sgpr70
-; SI-NEXT:    ; implicit-def: $sgpr42
-; SI-NEXT:    ; implicit-def: $sgpr43
-; SI-NEXT:    ; implicit-def: $sgpr51
-; SI-NEXT:    ; implicit-def: $sgpr55
-; SI-NEXT:    ; implicit-def: $sgpr67
-; SI-NEXT:    ; implicit-def: $vgpr16
-; SI-NEXT:    ; implicit-def: $sgpr41
-; SI-NEXT:    ; implicit-def: $sgpr48
-; SI-NEXT:    ; implicit-def: $sgpr52
+; SI-NEXT:    ; implicit-def: $sgpr74
+; SI-NEXT:    ; implicit-def: $sgpr68
+; SI-NEXT:    ; implicit-def: $sgpr66
 ; SI-NEXT:    ; implicit-def: $sgpr64
-; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; implicit-def: $sgpr40
-; SI-NEXT:    ; implicit-def: $sgpr37
-; SI-NEXT:    ; implicit-def: $sgpr49
-; SI-NEXT:    ; implicit-def: $sgpr53
-; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; implicit-def: $sgpr15
-; SI-NEXT:    ; implicit-def: $sgpr34
-; SI-NEXT:    ; implicit-def: $sgpr38
+; SI-NEXT:    ; implicit-def: $sgpr72
+; SI-NEXT:    ; implicit-def: $sgpr54
+; SI-NEXT:    ; implicit-def: $sgpr52
 ; SI-NEXT:    ; implicit-def: $sgpr50
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $sgpr14
-; SI-NEXT:    ; implicit-def: $sgpr95
-; SI-NEXT:    ; implicit-def: $sgpr35
-; SI-NEXT:    ; implicit-def: $sgpr39
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr57
-; SI-NEXT:    ; implicit-def: $vgpr47
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; implicit-def: $sgpr13
-; SI-NEXT:    ; implicit-def: $sgpr92
-; SI-NEXT:    ; implicit-def: $sgpr30
+; SI-NEXT:    ; implicit-def: $sgpr62
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr38
 ; SI-NEXT:    ; implicit-def: $sgpr36
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $vgpr44
-; SI-NEXT:    ; implicit-def: $vgpr43
-; SI-NEXT:    ; implicit-def: $vgpr41
-; SI-NEXT:    ; implicit-def: $sgpr12
-; SI-NEXT:    ; implicit-def: $sgpr89
-; SI-NEXT:    ; implicit-def: $sgpr93
-; SI-NEXT:    ; implicit-def: $sgpr31
-; SI-NEXT:    ; implicit-def: $sgpr11
-; SI-NEXT:    ; implicit-def: $sgpr90
+; SI-NEXT:    ; implicit-def: $sgpr58
+; SI-NEXT:    ; implicit-def: $sgpr34
+; SI-NEXT:    ; implicit-def: $sgpr30
 ; SI-NEXT:    ; implicit-def: $sgpr94
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr90
+; SI-NEXT:    ; implicit-def: $sgpr42
+; SI-NEXT:    ; implicit-def: $sgpr40
+; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr12
 ; SI-NEXT:    ; implicit-def: $sgpr10
-; SI-NEXT:    ; implicit-def: $sgpr91
-; SI-NEXT:    ; implicit-def: $sgpr9
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; kill: killed $vcc_lo
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $vgpr24
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr20
-; SI-NEXT:    ; implicit-def: $vgpr18
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $vgpr61
-; SI-NEXT:    ; implicit-def: $vgpr60
-; SI-NEXT:    ; implicit-def: $vgpr59
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $vgpr58
-; SI-NEXT:    ; implicit-def: $vgpr56
-; SI-NEXT:    ; implicit-def: $vgpr46
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr42
-; SI-NEXT:    ; implicit-def: $vgpr40
-; SI-NEXT:    ; implicit-def: $vgpr55
-; SI-NEXT:    ; implicit-def: $vgpr54
-; SI-NEXT:    ; implicit-def: $vgpr53
-; SI-NEXT:    ; implicit-def: $vgpr52
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; kill: killed $sgpr6
-; SI-NEXT:    ; implicit-def: $vcc_lo
-; SI-NEXT:    ; kill: killed $vcc_lo
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; kill: killed $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; kill: killed $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; kill: killed $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; kill: killed $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; kill: killed $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; kill: killed $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; kill: killed $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; kill: killed $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; kill: killed $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
 ; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    v_writelane_b32 v41, s4, 22
+; SI-NEXT:    v_writelane_b32 v41, s5, 23
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_writelane_b32 v41, s4, 24
+; SI-NEXT:    v_writelane_b32 v41, s5, 25
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_writelane_b32 v41, s4, 26
+; SI-NEXT:    v_writelane_b32 v41, s5, 27
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_writelane_b32 v41, s4, 28
+; SI-NEXT:    v_writelane_b32 v41, s5, 29
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; kill: killed $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_writelane_b32 v41, s4, 30
+; SI-NEXT:    v_writelane_b32 v41, s5, 31
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_writelane_b32 v41, s4, 32
+; SI-NEXT:    v_writelane_b32 v41, s5, 33
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_writelane_b32 v41, s4, 34
+; SI-NEXT:    v_writelane_b32 v41, s5, 35
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_writelane_b32 v41, s4, 36
+; SI-NEXT:    v_writelane_b32 v41, s5, 37
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_writelane_b32 v41, s4, 38
+; SI-NEXT:    v_writelane_b32 v41, s5, 39
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_writelane_b32 v41, s4, 40
+; SI-NEXT:    v_writelane_b32 v41, s5, 41
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_writelane_b32 v41, s4, 42
+; SI-NEXT:    v_writelane_b32 v41, s5, 43
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_writelane_b32 v41, s4, 44
+; SI-NEXT:    v_writelane_b32 v41, s5, 45
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_writelane_b32 v41, s4, 46
+; SI-NEXT:    v_writelane_b32 v41, s5, 47
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_writelane_b32 v41, s4, 48
+; SI-NEXT:    v_writelane_b32 v41, s5, 49
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_writelane_b32 v41, s4, 50
+; SI-NEXT:    v_writelane_b32 v41, s5, 51
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_writelane_b32 v41, s4, 52
+; SI-NEXT:    v_writelane_b32 v41, s5, 53
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_writelane_b32 v41, s4, 54
+; SI-NEXT:    v_writelane_b32 v41, s5, 55
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_writelane_b32 v41, s4, 56
+; SI-NEXT:    v_writelane_b32 v41, s5, 57
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_writelane_b32 v41, s4, 58
+; SI-NEXT:    v_writelane_b32 v41, s5, 59
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_writelane_b32 v41, s4, 60
+; SI-NEXT:    v_writelane_b32 v41, s5, 61
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_writelane_b32 v41, s4, 62
+; SI-NEXT:    v_writelane_b32 v41, s5, 63
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_writelane_b32 v43, s4, 0
+; SI-NEXT:    v_writelane_b32 v43, s5, 1
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_writelane_b32 v43, s4, 2
+; SI-NEXT:    v_writelane_b32 v43, s5, 3
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_writelane_b32 v43, s4, 4
+; SI-NEXT:    v_writelane_b32 v43, s5, 5
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_writelane_b32 v43, s4, 6
+; SI-NEXT:    v_writelane_b32 v43, s5, 7
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_writelane_b32 v43, s4, 8
+; SI-NEXT:    v_writelane_b32 v43, s5, 9
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_writelane_b32 v43, s4, 10
+; SI-NEXT:    v_writelane_b32 v43, s5, 11
+; SI-NEXT:    v_writelane_b32 v43, s16, 12
+; SI-NEXT:    v_writelane_b32 v43, s17, 13
+; SI-NEXT:    ; implicit-def: $sgpr16
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_writelane_b32 v43, s16, 14
+; SI-NEXT:    v_writelane_b32 v43, s17, 15
+; SI-NEXT:    ; implicit-def: $sgpr16
+; SI-NEXT:    v_writelane_b32 v43, s16, 16
+; SI-NEXT:    v_writelane_b32 v43, s17, 17
 ; SI-NEXT:    s_branch .LBB99_2
 ;
 ; VI-LABEL: bitcast_v64i16_to_v128i8_scalar:
@@ -227209,7 +229205,6 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; SI-LABEL: bitcast_v64bf16_to_v64i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v50, v22
 ; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
@@ -227226,1018 +229221,1096 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT:    v_mul_f32_e32 v58, 1.0, v1
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v2
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v50
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:80
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32
-; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:4
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:12
-; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:20
-; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:24
-; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:28
-; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:36
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:44
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:52
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:56
-; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:60
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:64
-; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:68
-; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:72
-; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:76
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4
+; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:16
+; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:20
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:28
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:32
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:36
+; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:40
+; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:44
+; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:48
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:52
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:56
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:60
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:64
+; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:68
+; SI-NEXT:    s_waitcnt expcnt(6)
+; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:72
+; SI-NEXT:    s_waitcnt expcnt(5)
+; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:76
+; SI-NEXT:    v_mul_f32_e32 v38, 1.0, v1
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v4
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT:    v_mov_b32_e32 v62, v28
-; SI-NEXT:    v_mul_f32_e32 v61, 1.0, v10
-; SI-NEXT:    v_mul_f32_e32 v10, 1.0, v62
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v5
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v3
-; SI-NEXT:    v_mul_f32_e32 v22, 1.0, v4
-; SI-NEXT:    v_mul_f32_e32 v60, 1.0, v5
-; SI-NEXT:    v_mul_f32_e32 v59, 1.0, v6
-; SI-NEXT:    v_mul_f32_e32 v29, 1.0, v7
-; SI-NEXT:    v_mul_f32_e32 v28, 1.0, v8
-; SI-NEXT:    v_mul_f32_e32 v63, 1.0, v9
-; SI-NEXT:    v_mul_f32_e32 v37, 1.0, v11
-; SI-NEXT:    v_mul_f32_e32 v11, 1.0, v12
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v12
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v20
+; SI-NEXT:    v_mul_f32_e32 v59, 1.0, v2
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_mul_f32_e32 v4, 1.0, v6
+; SI-NEXT:    v_mul_f32_e32 v50, 1.0, v7
+; SI-NEXT:    v_mul_f32_e32 v8, 1.0, v9
+; SI-NEXT:    v_mul_f32_e32 v6, 1.0, v10
+; SI-NEXT:    v_mul_f32_e32 v45, 1.0, v11
 ; SI-NEXT:    v_mul_f32_e32 v12, 1.0, v13
-; SI-NEXT:    v_mul_f32_e32 v30, 1.0, v14
-; SI-NEXT:    v_mul_f32_e32 v38, 1.0, v15
-; SI-NEXT:    v_mul_f32_e32 v34, 1.0, v17
-; SI-NEXT:    v_mul_f32_e32 v33, 1.0, v18
-; SI-NEXT:    v_mul_f32_e32 v15, 1.0, v20
-; SI-NEXT:    v_mul_f32_e32 v39, 1.0, v21
-; SI-NEXT:    v_mul_f32_e32 v14, 1.0, v23
-; SI-NEXT:    v_mul_f32_e32 v21, 1.0, v24
-; SI-NEXT:    v_mul_f32_e32 v50, 1.0, v25
-; SI-NEXT:    v_mul_f32_e32 v25, 1.0, v26
-; SI-NEXT:    v_mul_f32_e32 v13, 1.0, v27
-; SI-NEXT:    v_mul_f32_e64 v4, 1.0, s18
-; SI-NEXT:    v_mul_f32_e64 v3, 1.0, s19
-; SI-NEXT:    v_mul_f32_e64 v18, 1.0, s22
-; SI-NEXT:    v_mul_f32_e64 v5, 1.0, s23
-; SI-NEXT:    v_mul_f32_e64 v20, 1.0, s26
+; SI-NEXT:    v_mul_f32_e32 v10, 1.0, v14
+; SI-NEXT:    v_mul_f32_e32 v15, 1.0, v15
+; SI-NEXT:    v_mul_f32_e32 v14, 1.0, v18
+; SI-NEXT:    v_mul_f32_e32 v13, 1.0, v19
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; SI-NEXT:    v_mul_f32_e32 v18, 1.0, v21
+; SI-NEXT:    v_mul_f32_e32 v16, 1.0, v22
+; SI-NEXT:    v_mul_f32_e32 v19, 1.0, v23
+; SI-NEXT:    v_mul_f32_e32 v23, 1.0, v25
+; SI-NEXT:    v_mul_f32_e32 v27, 1.0, v27
+; SI-NEXT:    v_mul_f32_e32 v63, 1.0, v28
+; SI-NEXT:    v_mul_f32_e32 v7, 1.0, v29
+; SI-NEXT:    v_mul_f32_e32 v11, 1.0, v30
+; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s16
+; SI-NEXT:    v_mul_f32_e64 v30, 1.0, s17
+; SI-NEXT:    v_mul_f32_e64 v22, 1.0, s18
+; SI-NEXT:    v_mul_f32_e64 v20, 1.0, s19
+; SI-NEXT:    v_mul_f32_e64 v29, 1.0, s21
+; SI-NEXT:    v_mul_f32_e64 v28, 1.0, s22
+; SI-NEXT:    v_mul_f32_e64 v9, 1.0, s27
+; SI-NEXT:    v_mul_f32_e64 v21, 1.0, s28
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
-; SI-NEXT:    v_mul_f32_e32 v31, 1.0, v16
-; SI-NEXT:    v_mul_f32_e32 v16, 1.0, v19
-; SI-NEXT:    v_mul_f32_e32 v9, 1.0, v32
-; SI-NEXT:    v_mul_f32_e32 v27, 1.0, v35
+; SI-NEXT:    v_mul_f32_e32 v31, 1.0, v17
+; SI-NEXT:    v_mul_f32_e32 v17, 1.0, v24
+; SI-NEXT:    v_mul_f32_e32 v24, 1.0, v26
+; SI-NEXT:    v_mul_f32_e32 v62, 1.0, v32
+; SI-NEXT:    v_mul_f32_e32 v61, 1.0, v33
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_mul_f32_e32 v35, 1.0, v36
+; SI-NEXT:    v_mul_f32_e32 v53, 1.0, v34
+; SI-NEXT:    v_mul_f32_e32 v35, 1.0, v35
+; SI-NEXT:    v_mul_f32_e32 v25, 1.0, v36
+; SI-NEXT:    v_mul_f32_e32 v60, 1.0, v37
+; SI-NEXT:    v_mul_f32_e32 v37, 1.0, v39
 ; SI-NEXT:    v_mul_f32_e32 v36, 1.0, v48
-; SI-NEXT:    v_mul_f32_e32 v8, 1.0, v49
-; SI-NEXT:    v_mul_f32_e32 v26, 1.0, v51
-; SI-NEXT:    v_mul_f32_e32 v49, 1.0, v52
-; SI-NEXT:    v_mul_f32_e32 v48, 1.0, v53
-; SI-NEXT:    v_mul_f32_e32 v24, 1.0, v54
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v49
+; SI-NEXT:    v_mul_f32_e32 v56, 1.0, v51
+; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v52
+; SI-NEXT:    v_mul_f32_e32 v39, 1.0, v54
 ; SI-NEXT:    s_waitcnt vmcnt(13)
-; SI-NEXT:    v_mul_f32_e32 v23, 1.0, v55
+; SI-NEXT:    v_mul_f32_e32 v51, 1.0, v55
 ; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_mul_f32_e32 v53, 1.0, v40
+; SI-NEXT:    v_mul_f32_e32 v55, 1.0, v40
 ; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_mul_f32_e32 v54, 1.0, v41
+; SI-NEXT:    v_mul_f32_e32 v42, 1.0, v41
 ; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_mul_f32_e32 v7, 1.0, v42
+; SI-NEXT:    v_mul_f32_e32 v48, 1.0, v43
 ; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_mul_f32_e32 v6, 1.0, v43
+; SI-NEXT:    v_mul_f32_e32 v49, 1.0, v44
 ; SI-NEXT:    s_waitcnt vmcnt(8)
-; SI-NEXT:    v_mul_f32_e32 v41, 1.0, v44
+; SI-NEXT:    v_mul_f32_e32 v47, 1.0, v46
 ; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_mul_f32_e32 v40, 1.0, v45
+; SI-NEXT:    v_mul_f32_e32 v43, 1.0, v57
 ; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_mul_f32_e32 v51, 1.0, v46
-; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_mul_f32_e32 v47, 1.0, v47
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_mul_f32_e32 v62, 1.0, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT:    v_mul_f32_e32 v42, 1.0, v56
-; SI-NEXT:    v_mul_f32_e32 v55, 1.0, v57
-; SI-NEXT:    v_mul_f32_e64 v46, 1.0, s17
-; SI-NEXT:    v_mul_f32_e64 v56, 1.0, s20
-; SI-NEXT:    v_mul_f32_e64 v45, 1.0, s21
-; SI-NEXT:    v_mul_f32_e64 v44, 1.0, s24
-; SI-NEXT:    v_mul_f32_e64 v43, 1.0, s25
-; SI-NEXT:    v_mul_f32_e64 v19, 1.0, s27
-; SI-NEXT:    v_mul_f32_e64 v52, 1.0, s28
+; SI-NEXT:    v_mul_f32_e32 v58, 1.0, v58
+; SI-NEXT:    v_mul_f32_e64 v54, 1.0, s20
+; SI-NEXT:    v_mul_f32_e64 v26, 1.0, s23
+; SI-NEXT:    v_mul_f32_e64 v52, 1.0, s24
+; SI-NEXT:    v_mul_f32_e64 v34, 1.0, s25
+; SI-NEXT:    v_mul_f32_e64 v33, 1.0, s26
 ; SI-NEXT:    v_mul_f32_e64 v32, 1.0, s29
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_mul_f32_e32 v17, 1.0, v2
-; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s16
-; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v17, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v26, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB105_2
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    s_waitcnt expcnt(3)
+; SI-NEXT:    s_waitcnt expcnt(1)
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v46
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v20
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v54
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v26
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v52
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v21
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v50
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v45
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v15
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v29
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v14
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v13
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v28
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v56
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v16
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v37
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v19
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v44
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v24
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v11
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v27
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v52
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT:    v_mov_b32_e32 v52, v38
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v11
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v38
-; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v32
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT:    v_mov_b32_e32 v32, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v62
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v39
-; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v14
-; SI-NEXT:    v_mov_b32_e32 v14, v21
-; SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v35
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v25
-; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v13
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v25
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v9
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v36
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v34
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT:    v_mov_b32_e32 v57, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v24
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v39
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v22
-; SI-NEXT:    v_mov_b32_e32 v22, v11
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v30
-; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v33
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v16
-; SI-NEXT:    v_mov_b32_e32 v16, v15
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v51
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v10, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v27
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v48
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v49
+; SI-NEXT:    v_mov_b32_e32 v25, v1
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v58
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v30
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v34
+; SI-NEXT:    v_mov_b32_e32 v57, v13
+; SI-NEXT:    v_mov_b32_e32 v40, v3
+; SI-NEXT:    v_mov_b32_e32 v54, v50
+; SI-NEXT:    v_mov_b32_e32 v46, v19
+; SI-NEXT:    v_lshrrev_b32_e32 v20, 16, v9
+; SI-NEXT:    v_mov_b32_e32 v44, v15
+; SI-NEXT:    v_mov_b32_e32 v9, v11
+; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v6
 ; SI-NEXT:    s_mov_b64 s[4:5], 0
-; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v43
-; SI-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
-; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v58
-; SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v60
 ; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v59
-; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v63
-; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v61
-; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; SI-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; SI-NEXT:    v_mov_b32_e32 v28, v13
-; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v62
-; SI-NEXT:    v_mov_b32_e32 v46, v27
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v35
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v36
-; SI-NEXT:    v_mov_b32_e32 v45, v26
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v48
-; SI-NEXT:    v_mov_b32_e32 v44, v24
-; SI-NEXT:    v_mov_b32_e32 v43, v23
-; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v53
-; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v54
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT:    v_mov_b32_e32 v7, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v10
+; SI-NEXT:    v_mov_b32_e32 v41, v27
+; SI-NEXT:    v_mov_b32_e32 v52, v62
+; SI-NEXT:    v_mov_b32_e32 v21, v58
+; SI-NEXT:    v_mov_b32_e32 v58, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v29
+; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v28
+; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v33
+; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v32
+; SI-NEXT:    v_lshrrev_b32_e32 v26, 16, v38
+; SI-NEXT:    v_lshrrev_b32_e32 v20, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v23
+; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v53
+; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v60
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v37
+; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v56
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v55
+; SI-NEXT:    v_mov_b32_e32 v55, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v42
+; SI-NEXT:    v_mov_b32_e32 v56, v47
+; SI-NEXT:    v_mov_b32_e32 v53, v5
+; SI-NEXT:    v_mov_b32_e32 v42, v43
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(1)
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v13
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_lshrrev_b32_e32 v24, 16, v3
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v50
+; SI-NEXT:    s_waitcnt vmcnt(2) expcnt(1)
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v19
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v17
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v11
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
+; SI-NEXT:    v_mov_b32_e32 v5, v19
+; SI-NEXT:    v_mov_b32_e32 v7, v15
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v63
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v61
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v47
+; SI-NEXT:    v_mov_b32_e32 v47, v3
+; SI-NEXT:    v_mov_b32_e32 v3, v17
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v41
-; SI-NEXT:    v_mov_b32_e32 v56, v51
-; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v42
-; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v55
-; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_mov_b32_e32 v29, v2
-; SI-NEXT:    v_mov_b32_e32 v2, v37
-; SI-NEXT:    v_mov_b32_e32 v37, v34
-; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v50
-; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v26
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v23
-; SI-NEXT:    v_lshrrev_b32_e32 v26, 16, v49
-; SI-NEXT:    v_mov_b32_e32 v23, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v40
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v51
-; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v47
-; SI-NEXT:    v_mov_b32_e32 v47, v34
-; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v38
-; SI-NEXT:    v_mov_b32_e32 v34, v38
-; SI-NEXT:    v_mov_b32_e32 v38, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v43
+; SI-NEXT:    v_mov_b32_e32 v1, v13
 ; SI-NEXT:    s_branch .LBB105_3
 ; SI-NEXT:  .LBB105_2:
-; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_mov_b32_e32 v2, v37
-; SI-NEXT:    v_mov_b32_e32 v37, v34
-; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
+; SI-NEXT:    s_waitcnt expcnt(5)
+; SI-NEXT:    v_mov_b32_e32 v25, v1
+; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
+; SI-NEXT:    s_waitcnt expcnt(3)
+; SI-NEXT:    v_mov_b32_e32 v21, v58
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    v_mov_b32_e32 v52, v62
+; SI-NEXT:    ; kill: killed $vgpr2
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    ; implicit-def: $vgpr39
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; kill: killed $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr62
+; SI-NEXT:    ; implicit-def: $vgpr35
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; kill: killed $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    ; implicit-def: $vgpr28
+; SI-NEXT:    ; implicit-def: $vgpr58
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; kill: killed $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr51
+; SI-NEXT:    ; implicit-def: $vgpr26
+; SI-NEXT:    ; implicit-def: $vgpr59
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; kill: killed $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    ; implicit-def: $vgpr24
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; kill: killed $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr36
+; SI-NEXT:    ; implicit-def: $vgpr20
+; SI-NEXT:    ; implicit-def: $vgpr55
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; kill: killed $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr33
+; SI-NEXT:    ; implicit-def: $vgpr16
+; SI-NEXT:    ; implicit-def: $vgpr45
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; kill: killed $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; kill: killed $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; kill: killed $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; kill: killed $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; kill: killed $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; kill: killed $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; kill: killed $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; kill: killed $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; kill: killed $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; kill: killed $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; kill: killed $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; kill: killed $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; kill: killed $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; kill: killed $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; kill: killed $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; kill: killed $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; kill: killed $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr48
+; SI-NEXT:    ; implicit-def: $vgpr22
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    ; implicit-def: $vgpr18
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    v_mov_b32_e32 v7, v6
-; SI-NEXT:    ; kill: killed $vgpr1
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    ; implicit-def: $vgpr14
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    ; kill: killed $vgpr1
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    ; implicit-def: $vgpr12
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr6
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    ; implicit-def: $vgpr10
+; SI-NEXT:    ; implicit-def: $vgpr49
+; SI-NEXT:    ; implicit-def: $vgpr8
+; SI-NEXT:    ; implicit-def: $vgpr37
 ; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; kill: killed $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    ; kill: killed $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr4
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr6
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; kill: killed $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    v_mov_b32_e32 v52, v38
-; SI-NEXT:    v_mov_b32_e32 v22, v11
-; SI-NEXT:    v_mov_b32_e32 v32, v31
-; SI-NEXT:    v_mov_b32_e32 v16, v15
-; SI-NEXT:    v_mov_b32_e32 v14, v21
-; SI-NEXT:    v_mov_b32_e32 v28, v13
-; SI-NEXT:    v_mov_b32_e32 v10, v9
-; SI-NEXT:    v_mov_b32_e32 v57, v8
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v54, v50
+; SI-NEXT:    v_mov_b32_e32 v56, v47
+; SI-NEXT:    v_mov_b32_e32 v9, v11
+; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v53, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_mov_b32_e32 v46, v27
-; SI-NEXT:    v_mov_b32_e32 v45, v26
-; SI-NEXT:    v_mov_b32_e32 v44, v24
-; SI-NEXT:    v_mov_b32_e32 v43, v23
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v56, v51
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v40, v3
+; SI-NEXT:    v_mov_b32_e32 v44, v15
+; SI-NEXT:    v_mov_b32_e32 v57, v13
+; SI-NEXT:    v_mov_b32_e32 v46, v19
+; SI-NEXT:    v_mov_b32_e32 v41, v27
 ; SI-NEXT:    s_mov_b64 s[4:5], -1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr6
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; kill: killed $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr18
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $vgpr20
-; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $vgpr58
-; SI-NEXT:    ; implicit-def: $vgpr29
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr60
-; SI-NEXT:    ; implicit-def: $vgpr59
-; SI-NEXT:    ; implicit-def: $vgpr63
-; SI-NEXT:    ; implicit-def: $vgpr61
-; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr30
-; SI-NEXT:    ; implicit-def: $vgpr33
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr15
-; SI-NEXT:    ; implicit-def: $vgpr39
-; SI-NEXT:    ; implicit-def: $vgpr47
-; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr62
-; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; kill: killed $vgpr6
-; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr50
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; kill: killed $vgpr6
-; SI-NEXT:    ; implicit-def: $vgpr26
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr38
-; SI-NEXT:    ; implicit-def: $vgpr53
-; SI-NEXT:    ; implicit-def: $vgpr54
-; SI-NEXT:    ; kill: killed $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr48
-; SI-NEXT:    ; implicit-def: $vgpr49
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr40
-; SI-NEXT:    ; implicit-def: $vgpr41
-; SI-NEXT:    ; implicit-def: $vgpr42
+; SI-NEXT:    v_mov_b32_e32 v42, v43
+; SI-NEXT:    v_mov_b32_e32 v3, v17
+; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:  .LBB105_3: ; %Flow
-; SI-NEXT:    v_mov_b32_e32 v51, v7
-; SI-NEXT:    v_mov_b32_e32 v24, v10
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
 ; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
 ; SI-NEXT:    s_cbranch_vccnz .LBB105_5
 ; SI-NEXT:  ; %bb.4: ; %cmp.true
-; SI-NEXT:    v_mov_b32_e32 v9, v2
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v30
+; SI-NEXT:    v_add_f32_e32 v31, 0x40c00000, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v31
+; SI-NEXT:    v_mov_b32_e32 v38, v9
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v38
+; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; SI-NEXT:    v_and_b32_e32 v38, 0xffff0000, v31
+; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v2
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
-; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT:    v_alignbit_b32 v2, v4, v2, 16
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v29
+; SI-NEXT:    v_add_f32_e32 v30, 0x40c00000, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v30
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; SI-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT:    s_waitcnt vmcnt(8)
+; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; SI-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; SI-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; SI-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; SI-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; SI-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
+; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v28
+; SI-NEXT:    s_waitcnt vmcnt(6)
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT:    v_alignbit_b32 v2, v4, v2, 16
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v34
+; SI-NEXT:    v_add_f32_e32 v29, 0x40c00000, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v29
+; SI-NEXT:    v_and_b32_e32 v34, 0xffff0000, v30
+; SI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT:    v_alignbit_b32 v2, v4, v2, 16
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v32
+; SI-NEXT:    v_add_f32_e32 v27, 0x40c00000, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v27
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT:    v_alignbit_b32 v2, v4, v2, 16
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v1
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v40
+; SI-NEXT:    v_add_f32_e32 v26, 0x40c00000, v4
+; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v26
+; SI-NEXT:    v_alignbit_b32 v2, v4, v2, 16
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v11
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v54
+; SI-NEXT:    v_add_f32_e32 v19, 0x40c00000, v2
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; SI-NEXT:    v_alignbit_b32 v1, v4, v1, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v19
+; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v7
+; SI-NEXT:    v_add_f32_e32 v15, 0x40c00000, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v15
+; SI-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; SI-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; SI-NEXT:    v_alignbit_b32 v1, v5, v1, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v50
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; SI-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v44
+; SI-NEXT:    v_add_f32_e32 v16, 0x40c00000, v2
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
-; SI-NEXT:    v_alignbit_b32 v1, v6, v1, 16
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v7
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v16
+; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v13
-; SI-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v57
+; SI-NEXT:    v_add_f32_e32 v17, 0x40c00000, v2
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; SI-NEXT:    v_alignbit_b32 v1, v7, v1, 16
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v17
+; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v10
-; SI-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v46
+; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v2
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v7
-; SI-NEXT:    v_alignbit_b32 v1, v8, v1, 16
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v22
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v13
+; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v63
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v9
-; SI-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v41
+; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v2
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
-; SI-NEXT:    v_alignbit_b32 v1, v9, v1, 16
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v32
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v11
+; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v61
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v52
-; SI-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT:    v_add_f32_e32 v9, 0x40c00000, v2
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v9
-; SI-NEXT:    v_alignbit_b32 v1, v10, v1, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v9
+; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v16
-; SI-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
-; SI-NEXT:    v_and_b32_e32 v33, 0xffff0000, v7
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v60
+; SI-NEXT:    v_add_f32_e32 v7, 0x40c00000, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v7
+; SI-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_alignbit_b32 v38, v11, v1, 16
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v14
-; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v11
-; SI-NEXT:    v_and_b32_e32 v32, 0xffff0000, v11
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_alignbit_b32 v52, v12, v1, 16
-; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v28
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; SI-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v12
-; SI-NEXT:    v_alignbit_b32 v1, v13, v1, 16
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v46
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v24
-; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v25
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v13
-; SI-NEXT:    v_alignbit_b32 v1, v14, v1, 16
-; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v45
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; SI-NEXT:    v_add_f32_e32 v24, 0x40c00000, v24
+; SI-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
+; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT:    v_add_f32_e32 v5, 0x40c00000, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
+; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v57
-; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
-; SI-NEXT:    v_alignbit_b32 v50, v15, v1, 16
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v43
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v44
-; SI-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v2
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v15
-; SI-NEXT:    v_alignbit_b32 v1, v16, v1, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
+; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v51
-; SI-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v16
-; SI-NEXT:    v_and_b32_e32 v25, 0xffff0000, v12
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v56
+; SI-NEXT:    v_add_f32_e32 v4, 0x40c00000, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_alignbit_b32 v1, v18, v1, 16
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff0000, v17
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v21
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v56
-; SI-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v18
-; SI-NEXT:    v_alignbit_b32 v39, v19, v1, 16
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v17, 0xffff0000, v55
-; SI-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v17
-; SI-NEXT:    v_and_b32_e32 v17, 0xffff0000, v18
-; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_alignbit_b32 v41, v42, v1, 16
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
-; SI-NEXT:    v_alignbit_b32 v40, v41, v17, 16
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
-; SI-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v18
-; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; SI-NEXT:    v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v19
-; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; SI-NEXT:    v_mov_b32_e32 v17, v39
-; SI-NEXT:    v_mov_b32_e32 v39, v52
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_alignbit_b32 v48, v49, v1, 16
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
-; SI-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v18
-; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; SI-NEXT:    v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v42
+; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_alignbit_b32 v53, v54, v1, 16
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
-; SI-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v18
-; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_alignbit_b32 v2, v2, v1, 16
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_alignbit_b32 v26, v36, v1, 16
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; SI-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT:    v_alignbit_b32 v4, v6, v4, 16
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v53
+; SI-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT:    v_alignbit_b32 v6, v8, v6, 16
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
-; SI-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v18
-; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; SI-NEXT:    v_add_f32_e32 v21, 0x40c00000, v21
+; SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_alignbit_b32 v27, v35, v1, 16
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; SI-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT:    v_alignbit_b32 v8, v10, v8, 16
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; SI-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT:    v_alignbit_b32 v10, v12, v10, 16
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; SI-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; SI-NEXT:    v_alignbit_b32 v12, v14, v12, 16
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v23
+; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT:    v_alignbit_b32 v14, v18, v14, 16
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
 ; SI-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
 ; SI-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v18
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff0000, v37
-; SI-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT:    v_and_b32_e32 v37, 0xffff0000, v5
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; SI-NEXT:    v_alignbit_b32 v18, v20, v18, 16
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_alignbit_b32 v62, v23, v1, 16
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; SI-NEXT:    v_add_f32_e32 v23, 0x40c00000, v23
+; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v23
+; SI-NEXT:    v_and_b32_e32 v23, 0xffff0000, v47
+; SI-NEXT:    v_add_f32_e32 v23, 0x40c00000, v23
+; SI-NEXT:    v_alignbit_b32 v24, v24, v23, 16
+; SI-NEXT:    v_and_b32_e32 v23, 0xffff0000, v26
+; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; SI-NEXT:    v_add_f32_e32 v25, 0x40c00000, v25
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; SI-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT:    v_add_f32_e32 v12, 0x40c00000, v1
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v1
-; SI-NEXT:    v_alignbit_b32 v1, v30, v18, 16
-; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; SI-NEXT:    v_alignbit_b32 v47, v21, v12, 16
-; SI-NEXT:    v_and_b32_e32 v12, 0xffff0000, v34
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; SI-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
-; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
-; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v12
-; SI-NEXT:    v_alignbit_b32 v31, v22, v11, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v19
-; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v34, 0xffff0000, v6
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; SI-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
+; SI-NEXT:    v_alignbit_b32 v22, v21, v20, 16
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v21, 0xffff0000, v16
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshr_b64 v[48:49], v[21:22], 16
+; SI-NEXT:    v_lshr_b64 v[49:50], v[7:8], 16
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
-; SI-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT:    v_alignbit_b32 v12, v11, v18, 16
-; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; SI-NEXT:    v_add_f32_e32 v26, 0x40c00000, v26
+; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v26
+; SI-NEXT:    v_alignbit_b32 v26, v59, v25, 16
+; SI-NEXT:    v_and_b32_e32 v25, 0xffff0000, v27
+; SI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshr_b64 v[51:52], v[25:26], 16
+; SI-NEXT:    v_lshr_b64 v[52:53], v[1:2], 16
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; SI-NEXT:    v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v19
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
-; SI-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT:    v_alignbit_b32 v63, v61, v18, 16
-; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; SI-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
-; SI-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v18
-; SI-NEXT:    v_alignbit_b32 v60, v59, v7, 16
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; SI-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; SI-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
-; SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v7
-; SI-NEXT:    v_alignbit_b32 v58, v29, v6, 16
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; SI-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v6
-; SI-NEXT:    v_alignbit_b32 v20, v19, v5, 16
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; SI-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v20
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT:    v_alignbit_b32 v16, v45, v16, 16
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; SI-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT:    v_alignbit_b32 v18, v6, v5, 16
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; SI-NEXT:    v_add_f32_e32 v27, 0x40c00000, v27
+; SI-NEXT:    v_alignbit_b32 v28, v58, v27, 16
+; SI-NEXT:    v_and_b32_e32 v27, 0xffff0000, v29
+; SI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_f32_e32 v6, 0x40c00000, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; SI-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
+; SI-NEXT:    v_alignbit_b32 v20, v55, v20, 16
+; SI-NEXT:    v_lshr_b64 v[36:37], v[19:20], 16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v2, v6, 16
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; SI-NEXT:    v_alignbit_b32 v6, v2, v3, 16
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v18, v5, 16
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v63, v33, 16
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v12, v8, 16
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v1, v9, 16
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; SI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v29
+; SI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; SI-NEXT:    v_add_f32_e32 v29, 0x40c00000, v29
+; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v29
+; SI-NEXT:    v_alignbit_b32 v35, v43, v32, 16
+; SI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v30
+; SI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshr_b64 v[62:63], v[34:35], 16
+; SI-NEXT:    v_lshr_b64 v[33:34], v[15:16], 16
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; SI-NEXT:    v_add_f32_e32 v30, 0x40c00000, v30
+; SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v30
+; SI-NEXT:    v_alignbit_b32 v39, v29, v32, 16
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[31:32], v[38:39], 16
+; SI-NEXT:    v_lshr_b64 v[37:38], v[5:6], 16
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshr_b64 v[31:32], v[27:28], 16
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v31, v10, 16
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[31:32], v[23:24], 16
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v47, v32, 16
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[31:32], v[17:18], 16
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v62, v25, 16
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[31:32], v[13:14], 16
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v27, v13, 16
-; SI-NEXT:    v_alignbit_b32 v5, v20, v4, 16
-; SI-NEXT:    v_alignbit_b32 v4, v58, v37, 16
-; SI-NEXT:    v_alignbit_b32 v3, v60, v34, 16
-; SI-NEXT:    v_mov_b32_e32 v33, v38
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[31:32], v[11:12], 16
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v2, v26, v14, 16
-; SI-NEXT:    v_alignbit_b32 v38, v53, v15, 16
-; SI-NEXT:    v_mov_b32_e32 v15, v22
-; SI-NEXT:    v_alignbit_b32 v25, v48, v16, 16
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[31:32], v[9:10], 16
+; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    v_lshr_b64 v[31:32], v[3:4], 16
 ; SI-NEXT:  .LBB105_5: ; %end
-; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; SI-NEXT:    v_or_b32_e32 v6, v8, v6
-; SI-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v52
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v6, v2
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 4, v0
-; SI-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v39
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT:    v_or_b32_e32 v2, v2, v6
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 8, v0
-; SI-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 4, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v18
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v62
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT:    v_or_b32_e32 v2, v2, v6
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 12, v0
-; SI-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 8, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v35
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v43
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 12, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(2) expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v29
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    v_or_b32_e32 v2, v2, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 16, v0
-; SI-NEXT:    buffer_store_dword v2, v5, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v19
-; SI-NEXT:    v_or_b32_e32 v2, v2, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 20, v0
-; SI-NEXT:    buffer_store_dword v2, v5, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v28
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v58
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 20, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v51
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    v_or_b32_e32 v2, v2, v4
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 24, v0
-; SI-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v58
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v29
-; SI-NEXT:    v_or_b32_e32 v2, v2, v4
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 28, v0
-; SI-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 24, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v26
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v59
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 28, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(2) expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v25
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 32, v0
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v60
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v59
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v24
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 36, v0
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v36
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 40, v0
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v63
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v61
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v20
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v55
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 44, v0
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v33
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 48, v0
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v11
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v16
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v45
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 52, v0
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v48
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 56, v0
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v30
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v22
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 60, v0
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v33
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v15
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 64, v0
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v31
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v15
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v18
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x44, v0
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v39
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v15
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x48, v0
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v47
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v21
-; SI-NEXT:    v_or_b32_e32 v1, v2, v1
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x4c, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v14
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x4c, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v13
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x50, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x50, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v62
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v23
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x54, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v12
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x54, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v11
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x58, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x58, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v27
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v35
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x5c, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v10
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x5c, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v50
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v49
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x60, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x60, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v26
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v36
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x64, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v8
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x64, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v38
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v37
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x68, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x68, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v53
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v54
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x6c, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v6
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x6c, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v25
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v31
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x70, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x70, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v48
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v49
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x74, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v4
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x74, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v17
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v40
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x78, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x78, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v41
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v42
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
@@ -233541,120 +235614,138 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:80
-; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:4
-; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32
+; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:16
+; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:28
+; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:32
+; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:44
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:48
+; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:60
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:64
+; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:76
+; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:4
+; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:20
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:36
+; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:40
+; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:52
 ; SI-NEXT:    s_waitcnt expcnt(6)
-; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:16
+; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:56
 ; SI-NEXT:    s_waitcnt expcnt(5)
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:20
+; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:68
 ; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:72
+; SI-NEXT:    v_cvt_f16_f32_e32 v37, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v6
+; SI-NEXT:    v_cvt_f16_f32_e32 v42, v3
 ; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:28
+; SI-NEXT:    v_cvt_f16_f32_e32 v60, v30
 ; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:32
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:36
+; SI-NEXT:    v_cvt_f16_f32_e32 v61, s28
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:44
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:52
-; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:56
-; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:60
-; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:64
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:68
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:72
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:76
-; SI-NEXT:    v_cvt_f16_f32_e32 v52, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v53, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v49, v5
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v46, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v30, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v44, v8
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v7
-; SI-NEXT:    v_cvt_f16_f32_e32 v50, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v7, v8
-; SI-NEXT:    v_cvt_f16_f32_e32 v38, v9
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v10
+; SI-NEXT:    v_cvt_f16_f32_e32 v54, v20
+; SI-NEXT:    v_cvt_f16_f32_e32 v62, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v49, s21
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v11
-; SI-NEXT:    v_cvt_f16_f32_e32 v39, v10
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, v12
-; SI-NEXT:    v_cvt_f16_f32_e32 v35, v13
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v11
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v19
-; SI-NEXT:    v_cvt_f16_f32_e32 v36, v14
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, v16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v25
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v27
-; SI-NEXT:    v_cvt_f16_f32_e32 v32, v17
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, v18
-; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
-; SI-NEXT:    v_cvt_f16_f32_e32 v23, v23
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, v24
-; SI-NEXT:    v_cvt_f16_f32_e32 v18, v25
-; SI-NEXT:    v_cvt_f16_f32_e32 v19, v26
-; SI-NEXT:    v_cvt_f16_f32_e32 v28, v28
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v29
-; SI-NEXT:    v_cvt_f16_f32_e32 v16, v30
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, s17
-; SI-NEXT:    v_cvt_f16_f32_e32 v29, s26
-; SI-NEXT:    v_cvt_f16_f32_e32 v30, s27
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v15
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v18
+; SI-NEXT:    v_cvt_f16_f32_e32 v18, v17
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v19
+; SI-NEXT:    v_cvt_f16_f32_e32 v19, s23
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v22
+; SI-NEXT:    v_cvt_f16_f32_e32 v22, v13
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v23
+; SI-NEXT:    v_cvt_f16_f32_e32 v23, s27
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v26
+; SI-NEXT:    v_cvt_f16_f32_e32 v26, v9
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v27
+; SI-NEXT:    v_cvt_f16_f32_e32 v27, s19
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v45
-; SI-NEXT:    v_cvt_f16_f32_e32 v31, v21
-; SI-NEXT:    v_cvt_f16_f32_e32 v27, v46
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, v47
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT:    v_cvt_f16_f32_e32 v12, v56
-; SI-NEXT:    v_cvt_f16_f32_e32 v44, v57
-; SI-NEXT:    v_cvt_f16_f32_e32 v26, v58
-; SI-NEXT:    v_cvt_f16_f32_e32 v8, v59
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v60
-; SI-NEXT:    v_cvt_f16_f32_e32 v43, v61
-; SI-NEXT:    v_cvt_f16_f32_e32 v24, v62
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v63
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v32
+; SI-NEXT:    v_cvt_f16_f32_e32 v32, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v16
+; SI-NEXT:    v_cvt_f16_f32_e32 v31, v33
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f16_f32_e32 v33, v34
+; SI-NEXT:    v_cvt_f16_f32_e32 v48, v35
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v51
+; SI-NEXT:    v_cvt_f16_f32_e32 v15, v36
+; SI-NEXT:    v_cvt_f16_f32_e32 v7, v38
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v39
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v28
+; SI-NEXT:    v_cvt_f16_f32_e32 v63, v50
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, v52
+; SI-NEXT:    v_cvt_f16_f32_e32 v52, s16
+; SI-NEXT:    v_cvt_f16_f32_e32 v38, s20
+; SI-NEXT:    v_cvt_f16_f32_e32 v35, s24
+; SI-NEXT:    v_cvt_f16_f32_e32 v50, v12
+; SI-NEXT:    v_cvt_f16_f32_e32 v16, v21
+; SI-NEXT:    v_cvt_f16_f32_e32 v12, v24
+; SI-NEXT:    v_cvt_f16_f32_e32 v51, v29
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v55
+; SI-NEXT:    v_cvt_f16_f32_e32 v28, v45
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v47
+; SI-NEXT:    v_cvt_f16_f32_e32 v24, v56
 ; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v34
-; SI-NEXT:    s_waitcnt vmcnt(13)
-; SI-NEXT:    v_cvt_f16_f32_e32 v34, v37
-; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_cvt_f16_f32_e32 v21, v48
-; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, v51
-; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v54
-; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_cvt_f16_f32_e32 v51, v55
-; SI-NEXT:    s_waitcnt vmcnt(8)
-; SI-NEXT:    v_cvt_f16_f32_e32 v37, v40
-; SI-NEXT:    s_waitcnt vmcnt(7) expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v41
-; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_cvt_f16_f32_e32 v25, v42
-; SI-NEXT:    v_cvt_f16_f32_e32 v60, s16
-; SI-NEXT:    v_cvt_f16_f32_e32 v57, s18
-; SI-NEXT:    v_cvt_f16_f32_e32 v58, s19
-; SI-NEXT:    v_cvt_f16_f32_e32 v59, s20
-; SI-NEXT:    v_cvt_f16_f32_e32 v63, s21
-; SI-NEXT:    v_cvt_f16_f32_e32 v46, s22
-; SI-NEXT:    v_cvt_f16_f32_e32 v47, s23
-; SI-NEXT:    v_cvt_f16_f32_e32 v56, s24
-; SI-NEXT:    v_cvt_f16_f32_e32 v62, s25
-; SI-NEXT:    v_cvt_f16_f32_e32 v45, s28
-; SI-NEXT:    v_cvt_f16_f32_e32 v61, s29
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v57
+; SI-NEXT:    v_cvt_f16_f32_e32 v20, v58
+; SI-NEXT:    v_cvt_f16_f32_e32 v39, s18
+; SI-NEXT:    v_cvt_f16_f32_e32 v36, s22
+; SI-NEXT:    v_cvt_f16_f32_e32 v58, s25
+; SI-NEXT:    v_cvt_f16_f32_e32 v34, s26
+; SI-NEXT:    v_cvt_f16_f32_e32 v56, s29
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v53
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v41
+; SI-NEXT:    v_cvt_f16_f32_e32 v41, v43
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v59
 ; SI-NEXT:    s_cbranch_scc0 .LBB109_2
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_mov_b64 s[4:5], 0
@@ -233663,510 +235754,600 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i
 ; SI-NEXT:    s_mov_b64 s[4:5], -1
 ; SI-NEXT:  .LBB109_3: ; %Flow
 ; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; SI-NEXT:    v_mov_b32_e32 v41, v2
-; SI-NEXT:    v_mov_b32_e32 v2, v48
+; SI-NEXT:    v_mov_b32_e32 v8, v3
 ; SI-NEXT:    s_cbranch_vccnz .LBB109_5
 ; SI-NEXT:  ; %bb.4: ; %cmp.true
-; SI-NEXT:    v_cvt_f32_f16_e32 v25, v25
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT:    v_add_f32_e32 v25, 0x38000000, v25
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v27
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v39
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v62
+; SI-NEXT:    v_mov_b32_e32 v47, v38
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v27, v5
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v25, v25
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v27
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_or_b32_e32 v39, v3, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v19
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v36
+; SI-NEXT:    v_lshlrev_b32_e32 v38, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v49
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v19, v5
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; SI-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v19
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_or_b32_e32 v36, v3, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v23
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v34
+; SI-NEXT:    v_mov_b32_e32 v45, v35
+; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v1
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v23, v5
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v58
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v23
+; SI-NEXT:    v_mov_b32_e32 v57, v33
+; SI-NEXT:    v_or_b32_e32 v34, v3, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v37
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v32
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v53, v5
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v53
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v56
+; SI-NEXT:    v_or_b32_e32 v32, v3, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v30
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_mov_b32_e32 v9, v31
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v31, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v46
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT:    v_lshr_b64 v[58:59], v[33:34], 16
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; SI-NEXT:    v_cvt_f32_f16_e32 v19, v19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v44
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v50
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
 ; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
-; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
-; SI-NEXT:    v_cvt_f32_f16_e32 v18, v18
-; SI-NEXT:    v_cvt_f32_f16_e32 v22, v22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
-; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v33, v33
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v25
-; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
-; SI-NEXT:    v_add_f32_e32 v16, 0x38000000, v16
-; SI-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; SI-NEXT:    v_cvt_f32_f16_e32 v36, v36
-; SI-NEXT:    v_mov_b32_e32 v42, v44
-; SI-NEXT:    v_or_b32_e32 v44, v1, v48
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; SI-NEXT:    v_add_f32_e32 v15, 0x38000000, v15
-; SI-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
-; SI-NEXT:    v_cvt_f32_f16_e32 v35, v35
-; SI-NEXT:    v_cvt_f32_f16_e32 v39, v39
-; SI-NEXT:    v_or_b32_e32 v2, v2, v48
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
-; SI-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
-; SI-NEXT:    v_add_f32_e32 v22, 0x38000000, v22
-; SI-NEXT:    v_cvt_f32_f16_e32 v38, v38
-; SI-NEXT:    v_or_b32_e32 v5, v5, v48
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v9
-; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
-; SI-NEXT:    v_add_f32_e32 v31, 0x38000000, v31
-; SI-NEXT:    v_add_f32_e32 v33, 0x38000000, v33
-; SI-NEXT:    v_or_b32_e32 v8, v8, v48
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v12
-; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, v33
-; SI-NEXT:    v_add_f32_e32 v32, 0x38000000, v32
-; SI-NEXT:    v_add_f32_e32 v36, 0x38000000, v36
-; SI-NEXT:    v_or_b32_e32 v11, v11, v48
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v16
-; SI-NEXT:    v_cvt_f16_f32_e32 v32, v32
-; SI-NEXT:    v_cvt_f16_f32_e32 v36, v36
-; SI-NEXT:    v_add_f32_e32 v35, 0x38000000, v35
-; SI-NEXT:    v_add_f32_e32 v39, 0x38000000, v39
-; SI-NEXT:    v_or_b32_e32 v15, v15, v48
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v19
-; SI-NEXT:    v_cvt_f16_f32_e32 v35, v35
-; SI-NEXT:    v_cvt_f16_f32_e32 v39, v39
-; SI-NEXT:    v_add_f32_e32 v38, 0x38000000, v38
-; SI-NEXT:    v_or_b32_e32 v18, v18, v48
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v22
-; SI-NEXT:    v_cvt_f16_f32_e32 v38, v38
-; SI-NEXT:    v_or_b32_e32 v31, v31, v48
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v33
-; SI-NEXT:    v_or_b32_e32 v32, v32, v48
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v36
-; SI-NEXT:    v_or_b32_e32 v35, v35, v48
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v39
-; SI-NEXT:    v_or_b32_e32 v38, v38, v48
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, v50
-; SI-NEXT:    v_cvt_f32_f16_e32 v49, v49
-; SI-NEXT:    v_mov_b32_e32 v1, v51
-; SI-NEXT:    v_cvt_f32_f16_e32 v51, v52
-; SI-NEXT:    v_add_f32_e32 v48, 0x38000000, v48
-; SI-NEXT:    v_cvt_f16_f32_e32 v50, v48
-; SI-NEXT:    v_add_f32_e32 v48, 0x38000000, v49
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, v48
-; SI-NEXT:    v_cvt_f32_f16_e32 v30, v30
-; SI-NEXT:    v_lshlrev_b32_e32 v49, 16, v50
-; SI-NEXT:    v_cvt_f32_f16_e32 v29, v29
-; SI-NEXT:    v_or_b32_e32 v49, v48, v49
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, v53
-; SI-NEXT:    v_add_f32_e32 v30, 0x38000000, v30
-; SI-NEXT:    v_cvt_f16_f32_e32 v30, v30
-; SI-NEXT:    v_add_f32_e32 v29, 0x38000000, v29
-; SI-NEXT:    v_add_f32_e32 v48, 0x38000000, v48
-; SI-NEXT:    v_cvt_f16_f32_e32 v53, v48
-; SI-NEXT:    v_add_f32_e32 v48, 0x38000000, v51
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, v48
-; SI-NEXT:    v_cvt_f16_f32_e32 v29, v29
-; SI-NEXT:    v_lshlrev_b32_e32 v51, 16, v53
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT:    v_or_b32_e32 v52, v48, v51
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v30
-; SI-NEXT:    v_or_b32_e32 v29, v29, v48
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, v47
-; SI-NEXT:    v_cvt_f32_f16_e32 v51, v46
-; SI-NEXT:    v_cvt_f32_f16_e32 v54, v59
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_or_b32_e32 v30, v3, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v26
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_or_b32_e32 v26, v3, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v22
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_or_b32_e32 v22, v3, v5
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v18
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v49
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v54
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_or_b32_e32 v18, v3, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v16
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_mov_b32_e32 v54, v15
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v12
+; SI-NEXT:    v_mov_b32_e32 v12, v42
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_or_b32_e32 v16, v3, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v14
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_or_b32_e32 v14, v3, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v51
+; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v60
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v60, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v60
+; SI-NEXT:    v_or_b32_e32 v43, v3, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v9
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v10
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v55, v5
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v55
+; SI-NEXT:    v_or_b32_e32 v10, v3, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v48
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v41
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v48, v5
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v48
+; SI-NEXT:    v_or_b32_e32 v41, v3, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v7
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v6
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v7, v5
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
+; SI-NEXT:    v_or_b32_e32 v6, v3, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v63
+; SI-NEXT:    v_lshr_b64 v[62:63], v[38:39], 16
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
 ; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT:    v_add_f32_e32 v48, 0x38000000, v48
-; SI-NEXT:    v_cvt_f16_f32_e32 v47, v48
-; SI-NEXT:    v_add_f32_e32 v48, 0x38000000, v51
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, v48
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v51, 16, v47
-; SI-NEXT:    v_add_f32_e32 v54, 0x38000000, v54
-; SI-NEXT:    v_or_b32_e32 v46, v48, v51
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, v58
-; SI-NEXT:    v_cvt_f32_f16_e32 v51, v57
-; SI-NEXT:    v_cvt_f16_f32_e32 v54, v54
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT:    v_add_f32_e32 v48, 0x38000000, v48
-; SI-NEXT:    v_cvt_f16_f32_e32 v58, v48
-; SI-NEXT:    v_add_f32_e32 v48, 0x38000000, v51
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, v48
-; SI-NEXT:    v_cvt_f32_f16_e32 v55, v61
-; SI-NEXT:    v_lshlrev_b32_e32 v51, 16, v58
-; SI-NEXT:    v_cvt_f32_f16_e32 v40, v45
-; SI-NEXT:    v_or_b32_e32 v57, v48, v51
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, v60
-; SI-NEXT:    v_cvt_f32_f16_e32 v51, v63
-; SI-NEXT:    v_add_f32_e32 v55, 0x38000000, v55
-; SI-NEXT:    v_cvt_f16_f32_e32 v55, v55
-; SI-NEXT:    v_add_f32_e32 v48, 0x38000000, v48
-; SI-NEXT:    v_add_f32_e32 v51, 0x38000000, v51
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, v48
-; SI-NEXT:    v_cvt_f16_f32_e32 v51, v51
-; SI-NEXT:    v_add_f32_e32 v40, 0x38000000, v40
-; SI-NEXT:    v_cvt_f16_f32_e32 v40, v40
-; SI-NEXT:    v_or_b32_e32 v60, v48, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, v62
-; SI-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
-; SI-NEXT:    v_or_b32_e32 v59, v54, v51
-; SI-NEXT:    v_cvt_f32_f16_e32 v54, v56
-; SI-NEXT:    v_add_f32_e32 v48, 0x38000000, v48
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, v48
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; SI-NEXT:    v_add_f32_e32 v54, 0x38000000, v54
-; SI-NEXT:    v_cvt_f16_f32_e32 v54, v54
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v48
-; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; SI-NEXT:    v_or_b32_e32 v56, v54, v48
-; SI-NEXT:    v_lshlrev_b32_e32 v54, 16, v55
-; SI-NEXT:    v_cvt_f32_f16_e32 v55, v41
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT:    v_or_b32_e32 v45, v40, v54
-; SI-NEXT:    v_cvt_f32_f16_e32 v40, v7
-; SI-NEXT:    v_add_f32_e32 v55, 0x38000000, v55
-; SI-NEXT:    v_cvt_f16_f32_e32 v55, v55
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT:    v_add_f32_e32 v40, 0x38000000, v40
-; SI-NEXT:    v_cvt_f16_f32_e32 v40, v40
-; SI-NEXT:    v_lshlrev_b32_e32 v55, 16, v55
-; SI-NEXT:    v_cvt_f32_f16_e32 v20, v20
-; SI-NEXT:    v_cvt_f32_f16_e32 v28, v28
-; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v40
-; SI-NEXT:    v_cvt_f32_f16_e32 v27, v27
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v49
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v42, 16, v1
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v50
+; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v51, v4
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v51
+; SI-NEXT:    v_or_b32_e32 v4, v3, v4
+; SI-NEXT:    v_mov_b32_e32 v63, v51
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v50
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v28
+; SI-NEXT:    v_cvt_f32_f16_e32 v28, v45
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_add_f32_e32 v28, 0x38000000, v28
+; SI-NEXT:    v_cvt_f16_f32_e32 v28, v28
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v24
+; SI-NEXT:    v_cvt_f32_f16_e32 v24, v47
+; SI-NEXT:    v_or_b32_e32 v44, v28, v33
+; SI-NEXT:    v_lshr_b64 v[46:47], v[29:30], 16
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_add_f32_e32 v24, 0x38000000, v24
+; SI-NEXT:    v_cvt_f16_f32_e32 v24, v24
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_or_b32_e32 v49, v24, v35
+; SI-NEXT:    v_cvt_f32_f16_e32 v24, v12
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v20
+; SI-NEXT:    v_cvt_f32_f16_e32 v20, v11
+; SI-NEXT:    v_add_f32_e32 v24, 0x38000000, v24
+; SI-NEXT:    v_cvt_f16_f32_e32 v24, v24
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_add_f32_e32 v20, 0x38000000, v20
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, v20
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v11
+; SI-NEXT:    v_or_b32_e32 v2, v2, v20
+; SI-NEXT:    v_cvt_f32_f16_e32 v20, v52
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_add_f32_e32 v20, 0x38000000, v20
 ; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
+; SI-NEXT:    v_or_b32_e32 v52, v20, v38
+; SI-NEXT:    v_cvt_f32_f16_e32 v20, v61
+; SI-NEXT:    v_or_b32_e32 v61, v24, v29
+; SI-NEXT:    v_mov_b32_e32 v38, v49
+; SI-NEXT:    v_add_f32_e32 v20, 0x38000000, v20
+; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
+; SI-NEXT:    v_or_b32_e32 v37, v20, v31
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v28, v12
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v28, 0x38000000, v28
 ; SI-NEXT:    v_cvt_f16_f32_e32 v28, v28
-; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v27
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; SI-NEXT:    v_cvt_f32_f16_e32 v26, v26
-; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
-; SI-NEXT:    v_cvt_f16_f32_e32 v27, v27
-; SI-NEXT:    v_cvt_f32_f16_e32 v42, v42
-; SI-NEXT:    v_add_f32_e32 v26, 0x38000000, v26
-; SI-NEXT:    v_cvt_f16_f32_e32 v26, v26
-; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v37
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
-; SI-NEXT:    v_cvt_f32_f16_e32 v24, v24
-; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; SI-NEXT:    v_cvt_f32_f16_e32 v23, v23
-; SI-NEXT:    v_cvt_f32_f16_e32 v34, v34
-; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v37
-; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
-; SI-NEXT:    v_add_f32_e32 v17, 0x38000000, v17
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v20, v12
+; SI-NEXT:    v_or_b32_e32 v12, v28, v25
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v20, 0x38000000, v20
+; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v24, v12
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v24, 0x38000000, v24
-; SI-NEXT:    v_add_f32_e32 v21, 0x38000000, v21
-; SI-NEXT:    v_cvt_f16_f32_e32 v37, v37
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
-; SI-NEXT:    v_add_f32_e32 v23, 0x38000000, v23
 ; SI-NEXT:    v_cvt_f16_f32_e32 v24, v24
-; SI-NEXT:    v_cvt_f16_f32_e32 v21, v21
-; SI-NEXT:    v_add_f32_e32 v34, 0x38000000, v34
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; SI-NEXT:    v_cvt_f16_f32_e32 v23, v23
-; SI-NEXT:    v_cvt_f16_f32_e32 v34, v34
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v37
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; SI-NEXT:    v_or_b32_e32 v14, v14, v13
-; SI-NEXT:    v_or_b32_e32 v23, v23, v17
-; SI-NEXT:    v_or_b32_e32 v34, v34, v21
-; SI-NEXT:    v_alignbit_b32 v4, v57, v4, 16
-; SI-NEXT:    v_alignbit_b32 v63, v46, v51, 16
-; SI-NEXT:    v_alignbit_b32 v62, v29, v48, 16
-; SI-NEXT:    v_alignbit_b32 v61, v52, v54, 16
-; SI-NEXT:    v_alignbit_b32 v13, v32, v13, 16
-; SI-NEXT:    v_alignbit_b32 v17, v18, v17, 16
-; SI-NEXT:    v_alignbit_b32 v21, v2, v21, 16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v41
-; SI-NEXT:    v_add_f32_e32 v41, 0x38000000, v41
-; SI-NEXT:    v_cvt_f16_f32_e32 v41, v41
-; SI-NEXT:    v_or_b32_e32 v7, v41, v55
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v28, v12
+; SI-NEXT:    v_or_b32_e32 v12, v20, v21
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v28, 0x38000000, v28
+; SI-NEXT:    v_cvt_f16_f32_e32 v28, v28
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v41
-; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
-; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; SI-NEXT:    v_add_f32_e32 v41, 0x38000000, v41
-; SI-NEXT:    v_cvt_f16_f32_e32 v41, v41
-; SI-NEXT:    v_or_b32_e32 v7, v7, v40
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v20, v12
+; SI-NEXT:    v_or_b32_e32 v12, v24, v17
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_or_b32_e32 v7, v41, v10
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    v_or_b32_e32 v12, v28, v15
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT:    v_alignbit_b32 v10, v35, v10, 16
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v20, 0x38000000, v20
+; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v7
-; SI-NEXT:    v_add_f32_e32 v41, 0x38000000, v41
-; SI-NEXT:    v_cvt_f16_f32_e32 v41, v41
-; SI-NEXT:    v_or_b32_e32 v7, v41, v20
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT:    v_alignbit_b32 v20, v31, v20, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v24, v12
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v24, 0x38000000, v24
+; SI-NEXT:    v_cvt_f16_f32_e32 v24, v24
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v7
-; SI-NEXT:    v_add_f32_e32 v41, 0x38000000, v41
-; SI-NEXT:    v_cvt_f16_f32_e32 v41, v41
-; SI-NEXT:    v_or_b32_e32 v7, v41, v28
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v28, v12
+; SI-NEXT:    v_or_b32_e32 v12, v20, v13
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT:    v_alignbit_b32 v28, v15, v28, 16
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v7
-; SI-NEXT:    v_add_f32_e32 v41, 0x38000000, v41
-; SI-NEXT:    v_cvt_f16_f32_e32 v41, v41
-; SI-NEXT:    v_or_b32_e32 v7, v41, v27
-; SI-NEXT:    v_add_f32_e32 v41, 0x38000000, v42
-; SI-NEXT:    v_cvt_f16_f32_e32 v41, v41
-; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT:    v_cvt_f32_f16_e32 v42, v43
-; SI-NEXT:    v_alignbit_b32 v27, v11, v27, 16
+; SI-NEXT:    v_or_b32_e32 v12, v24, v42
+; SI-NEXT:    v_add_f32_e32 v28, 0x38000000, v28
+; SI-NEXT:    v_cvt_f16_f32_e32 v28, v28
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v20, v57
+; SI-NEXT:    v_cvt_f32_f16_e32 v24, v54
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_or_b32_e32 v7, v41, v26
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v1
-; SI-NEXT:    v_add_f32_e32 v42, 0x38000000, v42
-; SI-NEXT:    v_cvt_f16_f32_e32 v42, v42
-; SI-NEXT:    v_mov_b32_e32 v1, v44
-; SI-NEXT:    v_add_f32_e32 v41, 0x38000000, v41
-; SI-NEXT:    v_cvt_f16_f32_e32 v41, v41
-; SI-NEXT:    v_mov_b32_e32 v44, v7
-; SI-NEXT:    v_or_b32_e32 v43, v42, v24
-; SI-NEXT:    v_alignbit_b32 v26, v8, v26, 16
-; SI-NEXT:    v_or_b32_e32 v7, v41, v37
-; SI-NEXT:    v_mov_b32_e32 v51, v7
-; SI-NEXT:    v_alignbit_b32 v41, v49, v55, 16
-; SI-NEXT:    v_alignbit_b32 v7, v38, v40, 16
-; SI-NEXT:    v_alignbit_b32 v24, v5, v24, 16
-; SI-NEXT:    v_alignbit_b32 v37, v1, v37, 16
+; SI-NEXT:    v_or_b32_e32 v12, v28, v9
+; SI-NEXT:    v_cvt_f32_f16_e32 v28, v8
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v20, 0x38000000, v20
+; SI-NEXT:    v_add_f32_e32 v24, 0x38000000, v24
+; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
+; SI-NEXT:    v_cvt_f16_f32_e32 v24, v24
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[56:57], v[31:32], 16
+; SI-NEXT:    v_or_b32_e32 v54, v20, v40
+; SI-NEXT:    v_or_b32_e32 v20, v24, v5
+; SI-NEXT:    v_mov_b32_e32 v33, v54
+; SI-NEXT:    v_mov_b32_e32 v31, v55
+; SI-NEXT:    v_lshr_b64 v[54:55], v[15:16], 16
+; SI-NEXT:    v_mov_b32_e32 v15, v20
+; SI-NEXT:    v_add_f32_e32 v28, 0x38000000, v28
+; SI-NEXT:    v_cvt_f16_f32_e32 v28, v28
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v50, v8
+; SI-NEXT:    v_or_b32_e32 v8, v28, v3
+; SI-NEXT:    v_lshr_b64 v[28:29], v[5:6], 16
+; SI-NEXT:    v_add_f32_e32 v50, 0x38000000, v50
+; SI-NEXT:    v_cvt_f16_f32_e32 v50, v50
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_or_b32_e32 v12, v50, v1
+; SI-NEXT:    v_lshr_b64 v[49:50], v[35:36], 16
+; SI-NEXT:    v_mov_b32_e32 v35, v44
+; SI-NEXT:    v_lshr_b64 v[44:45], v[25:26], 16
+; SI-NEXT:    v_lshr_b64 v[50:51], v[21:22], 16
+; SI-NEXT:    v_lshr_b64 v[24:25], v[17:18], 16
+; SI-NEXT:    v_lshr_b64 v[20:21], v[42:43], 16
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshr_b64 v[20:21], v[9:10], 16
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshr_b64 v[20:21], v[40:41], 16
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[12:13], v[13:14], 16
+; SI-NEXT:    v_lshr_b64 v[24:25], v[3:4], 16
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshr_b64 v[20:21], v[1:2], 16
+; SI-NEXT:    v_mov_b32_e32 v42, v61
+; SI-NEXT:    v_mov_b32_e32 v61, v37
+; SI-NEXT:    v_mov_b32_e32 v37, v53
+; SI-NEXT:    v_mov_b32_e32 v51, v43
 ; SI-NEXT:  .LBB109_5: ; %end
-; SI-NEXT:    v_and_b32_e32 v48, 0xffff, v60
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT:    v_or_b32_e32 v4, v48, v4
-; SI-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v57
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v58
-; SI-NEXT:    v_or_b32_e32 v4, v4, v48
-; SI-NEXT:    v_add_i32_e32 v48, vcc, 4, v0
-; SI-NEXT:    buffer_store_dword v4, v48, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v59
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v63
-; SI-NEXT:    v_or_b32_e32 v4, v4, v48
-; SI-NEXT:    v_add_i32_e32 v48, vcc, 8, v0
-; SI-NEXT:    buffer_store_dword v4, v48, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v46
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v47
-; SI-NEXT:    v_or_b32_e32 v4, v4, v48
-; SI-NEXT:    v_add_i32_e32 v48, vcc, 12, v0
-; SI-NEXT:    buffer_store_dword v4, v48, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v56
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v62
-; SI-NEXT:    v_or_b32_e32 v4, v4, v48
-; SI-NEXT:    v_add_i32_e32 v48, vcc, 16, v0
-; SI-NEXT:    buffer_store_dword v4, v48, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v29
-; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v30
-; SI-NEXT:    v_or_b32_e32 v4, v4, v29
-; SI-NEXT:    v_add_i32_e32 v29, vcc, 20, v0
-; SI-NEXT:    buffer_store_dword v4, v29, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v45
-; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v61
-; SI-NEXT:    v_or_b32_e32 v4, v4, v29
-; SI-NEXT:    v_add_i32_e32 v29, vcc, 24, v0
-; SI-NEXT:    buffer_store_dword v4, v29, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v52
-; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v53
-; SI-NEXT:    v_or_b32_e32 v4, v4, v29
-; SI-NEXT:    v_add_i32_e32 v29, vcc, 28, v0
-; SI-NEXT:    buffer_store_dword v4, v29, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v41
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x74, v0
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; SI-NEXT:    v_or_b32_e32 v4, v4, v29
-; SI-NEXT:    v_add_i32_e32 v29, vcc, 32, v0
-; SI-NEXT:    buffer_store_dword v4, v29, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v62
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v52
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v49
-; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v50
-; SI-NEXT:    v_or_b32_e32 v4, v4, v29
-; SI-NEXT:    v_add_i32_e32 v29, vcc, 36, v0
-; SI-NEXT:    buffer_store_dword v4, v29, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v39
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v27
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 4, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; SI-NEXT:    v_or_b32_e32 v4, v4, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 40, v0
-; SI-NEXT:    buffer_store_dword v4, v7, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v49
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v38
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 8, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v38
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v39
-; SI-NEXT:    v_or_b32_e32 v4, v4, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 44, v0
-; SI-NEXT:    buffer_store_dword v4, v7, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v36
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v19
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 12, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v58
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v35
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v34
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v23
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 20, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v56
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v61
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 24, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v32
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v37
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 28, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v10
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v46
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v42
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 32, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v30
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; SI-NEXT:    v_or_b32_e32 v4, v4, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 48, v0
-; SI-NEXT:    buffer_store_dword v4, v7, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 36, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v35
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v36
-; SI-NEXT:    v_or_b32_e32 v4, v4, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 52, v0
-; SI-NEXT:    buffer_store_dword v4, v7, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v44
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 40, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v14
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v13
-; SI-NEXT:    v_or_b32_e32 v4, v4, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 56, v0
-; SI-NEXT:    buffer_store_dword v4, v7, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v26
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 44, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v32
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v33
-; SI-NEXT:    v_or_b32_e32 v4, v4, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 60, v0
-; SI-NEXT:    buffer_store_dword v4, v7, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v50
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 48, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v20
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v22
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; SI-NEXT:    v_or_b32_e32 v4, v4, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 64, v0
-; SI-NEXT:    buffer_store_dword v4, v7, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 52, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v31
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v22
-; SI-NEXT:    v_or_b32_e32 v4, v4, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x44, v0
-; SI-NEXT:    buffer_store_dword v4, v7, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v21
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 56, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v23
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v17
-; SI-NEXT:    v_or_b32_e32 v4, v4, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x48, v0
-; SI-NEXT:    buffer_store_dword v4, v7, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v18
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 60, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v18
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v19
-; SI-NEXT:    v_or_b32_e32 v4, v4, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x4c, v0
-; SI-NEXT:    buffer_store_dword v4, v7, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v54
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 64, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v28
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v16
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; SI-NEXT:    v_or_b32_e32 v4, v4, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x50, v0
-; SI-NEXT:    buffer_store_dword v4, v7, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x44, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v15
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v16
-; SI-NEXT:    v_or_b32_e32 v4, v4, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x54, v0
-; SI-NEXT:    buffer_store_dword v4, v7, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v12
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x48, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v27
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v14
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; SI-NEXT:    v_or_b32_e32 v4, v4, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x58, v0
-; SI-NEXT:    buffer_store_dword v4, v7, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x4c, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v12
-; SI-NEXT:    v_or_b32_e32 v4, v4, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x5c, v0
-; SI-NEXT:    buffer_store_dword v4, v7, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v12
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x50, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v44
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v26
-; SI-NEXT:    v_or_b32_e32 v4, v4, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x60, v0
-; SI-NEXT:    buffer_store_dword v4, v7, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v51
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v60
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x54, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v9
-; SI-NEXT:    v_or_b32_e32 v4, v4, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x64, v0
-; SI-NEXT:    buffer_store_dword v4, v7, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v12
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x58, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v43
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v24
-; SI-NEXT:    v_or_b32_e32 v4, v4, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x68, v0
-; SI-NEXT:    buffer_store_dword v4, v7, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v10
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v31
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x5c, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
-; SI-NEXT:    v_or_b32_e32 v4, v4, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x6c, v0
-; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v33
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v9
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x60, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v34
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v21
-; SI-NEXT:    v_or_b32_e32 v4, v4, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x70, v0
-; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v41
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v48
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x64, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v51
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v37
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v15
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v28
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x68, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x6c, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v8
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v24
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x70, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v63
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x74, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v20
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x78, v0
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v25
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v11
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x7c, v0
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
index 9b28fd9e7b6fd..64b5ecc8f6b8e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
@@ -1209,37 +1209,35 @@ define inreg <8 x i16> @bitcast_v4i32_to_v8i16_scalar(<4 x i32> inreg %a, i32 in
 ; SI-NEXT:    s_cmp_lg_u32 s20, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB13_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s19, 16
-; SI-NEXT:    s_lshr_b32 s7, s17, 16
+; SI-NEXT:    s_lshr_b32 s10, s19, 16
+; SI-NEXT:    s_lshr_b32 s11, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB13_3
 ; SI-NEXT:  .LBB13_2: ; %cmp.true
-; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_add_i32 s17, s17, 3
 ; SI-NEXT:    s_add_i32 s16, s16, 3
 ; SI-NEXT:    s_add_i32 s19, s19, 3
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    s_add_i32 s17, s17, 3
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s19, 16
-; SI-NEXT:    s_lshr_b32 s7, s17, 16
+; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_lshr_b64 s[4:5], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[16:17], 16
+; SI-NEXT:    s_lshr_b32 s10, s19, 16
+; SI-NEXT:    s_lshr_b32 s11, s17, 16
 ; SI-NEXT:  .LBB13_3: ; %end
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
+; SI-NEXT:    v_mov_b32_e32 v1, s6
 ; SI-NEXT:    v_mov_b32_e32 v2, s17
-; SI-NEXT:    v_mov_b32_e32 v3, s7
+; SI-NEXT:    v_mov_b32_e32 v3, s11
 ; SI-NEXT:    v_mov_b32_e32 v4, s18
+; SI-NEXT:    v_mov_b32_e32 v5, s4
 ; SI-NEXT:    v_mov_b32_e32 v6, s19
-; SI-NEXT:    v_mov_b32_e32 v7, s6
+; SI-NEXT:    v_mov_b32_e32 v7, s10
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB13_4:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $vgpr5
 ; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr11
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr10
 ; SI-NEXT:    s_branch .LBB13_2
 ;
 ; VI-LABEL: bitcast_v4i32_to_v8i16_scalar:
@@ -3544,65 +3542,67 @@ define inreg <16 x i8> @bitcast_v4i32_to_v16i8_scalar(<4 x i32> inreg %a, i32 in
 ; SI-NEXT:    s_cmp_lg_u32 s20, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB25_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v11, s19, v0, 24
-; SI-NEXT:    v_alignbit_b32 v10, s19, v0, 16
-; SI-NEXT:    v_alignbit_b32 v9, s19, v0, 8
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v3, s17, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, s17, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 8
-; SI-NEXT:    s_lshr_b32 s6, s19, 24
-; SI-NEXT:    s_lshr_b32 s7, s19, 16
-; SI-NEXT:    s_lshr_b32 s8, s19, 8
-; SI-NEXT:    s_lshr_b32 s9, s17, 24
-; SI-NEXT:    s_lshr_b32 s10, s17, 16
-; SI-NEXT:    s_lshr_b32 s11, s17, 8
+; SI-NEXT:    s_lshr_b32 s22, s19, 24
+; SI-NEXT:    s_lshr_b32 s23, s19, 16
+; SI-NEXT:    s_lshr_b32 s24, s19, 8
+; SI-NEXT:    s_lshr_b32 s25, s17, 24
+; SI-NEXT:    s_lshr_b32 s26, s17, 16
+; SI-NEXT:    s_lshr_b32 s27, s17, 8
+; SI-NEXT:    s_lshr_b64 s[4:5], s[18:19], 24
+; SI-NEXT:    s_lshr_b64 s[6:7], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[18:19], 8
+; SI-NEXT:    s_lshr_b64 s[10:11], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[12:13], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[16:17], 8
 ; SI-NEXT:    s_cbranch_execnz .LBB25_3
 ; SI-NEXT:  .LBB25_2: ; %cmp.true
-; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_add_i32 s17, s17, 3
 ; SI-NEXT:    s_add_i32 s16, s16, 3
 ; SI-NEXT:    s_add_i32 s19, s19, 3
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    s_add_i32 s17, s17, 3
-; SI-NEXT:    v_alignbit_b32 v11, s19, v0, 24
-; SI-NEXT:    v_alignbit_b32 v10, s19, v0, 16
-; SI-NEXT:    v_alignbit_b32 v9, s19, v0, 8
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v3, s17, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, s17, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 8
-; SI-NEXT:    s_lshr_b32 s6, s19, 24
-; SI-NEXT:    s_lshr_b32 s7, s19, 16
-; SI-NEXT:    s_lshr_b32 s8, s19, 8
-; SI-NEXT:    s_lshr_b32 s9, s17, 24
-; SI-NEXT:    s_lshr_b32 s10, s17, 16
-; SI-NEXT:    s_lshr_b32 s11, s17, 8
+; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_lshr_b64 s[4:5], s[18:19], 24
+; SI-NEXT:    s_lshr_b64 s[6:7], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[18:19], 8
+; SI-NEXT:    s_lshr_b64 s[10:11], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[12:13], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[16:17], 8
+; SI-NEXT:    s_lshr_b32 s22, s19, 24
+; SI-NEXT:    s_lshr_b32 s23, s19, 16
+; SI-NEXT:    s_lshr_b32 s24, s19, 8
+; SI-NEXT:    s_lshr_b32 s25, s17, 24
+; SI-NEXT:    s_lshr_b32 s26, s17, 16
+; SI-NEXT:    s_lshr_b32 s27, s17, 8
 ; SI-NEXT:  .LBB25_3: ; %end
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
+; SI-NEXT:    v_mov_b32_e32 v1, s14
+; SI-NEXT:    v_mov_b32_e32 v2, s12
+; SI-NEXT:    v_mov_b32_e32 v3, s10
 ; SI-NEXT:    v_mov_b32_e32 v4, s17
-; SI-NEXT:    v_mov_b32_e32 v5, s11
-; SI-NEXT:    v_mov_b32_e32 v6, s10
-; SI-NEXT:    v_mov_b32_e32 v7, s9
+; SI-NEXT:    v_mov_b32_e32 v5, s27
+; SI-NEXT:    v_mov_b32_e32 v6, s26
+; SI-NEXT:    v_mov_b32_e32 v7, s25
 ; SI-NEXT:    v_mov_b32_e32 v8, s18
+; SI-NEXT:    v_mov_b32_e32 v9, s8
+; SI-NEXT:    v_mov_b32_e32 v10, s6
+; SI-NEXT:    v_mov_b32_e32 v11, s4
 ; SI-NEXT:    v_mov_b32_e32 v12, s19
-; SI-NEXT:    v_mov_b32_e32 v13, s8
-; SI-NEXT:    v_mov_b32_e32 v14, s7
-; SI-NEXT:    v_mov_b32_e32 v15, s6
+; SI-NEXT:    v_mov_b32_e32 v13, s24
+; SI-NEXT:    v_mov_b32_e32 v14, s23
+; SI-NEXT:    v_mov_b32_e32 v15, s22
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB25_4:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $sgpr11
+; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr12
 ; SI-NEXT:    ; implicit-def: $sgpr10
-; SI-NEXT:    ; implicit-def: $sgpr9
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $vgpr11
+; SI-NEXT:    ; implicit-def: $sgpr27
+; SI-NEXT:    ; implicit-def: $sgpr26
+; SI-NEXT:    ; implicit-def: $sgpr25
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $sgpr7
 ; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr24
+; SI-NEXT:    ; implicit-def: $sgpr23
+; SI-NEXT:    ; implicit-def: $sgpr22
 ; SI-NEXT:    s_branch .LBB25_2
 ;
 ; VI-LABEL: bitcast_v4i32_to_v16i8_scalar:
@@ -5664,36 +5664,41 @@ define inreg <8 x i16> @bitcast_v4f32_to_v8i16_scalar(<4 x float> inreg %a, i32
 ; SI-NEXT:    s_cmp_lg_u32 s20, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB37_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s7, s19, 16
-; SI-NEXT:    s_lshr_b32 s6, s17, 16
+; SI-NEXT:    s_lshr_b32 s11, s19, 16
+; SI-NEXT:    s_lshr_b32 s10, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB37_4
 ; SI-NEXT:  .LBB37_2: ; %cmp.true
-; SI-NEXT:    v_add_f32_e64 v2, s17, 1.0
-; SI-NEXT:    v_add_f32_e64 v0, s16, 1.0
-; SI-NEXT:    v_add_f32_e64 v6, s19, 1.0
-; SI-NEXT:    v_add_f32_e64 v4, s18, 1.0
-; SI-NEXT:    v_alignbit_b32 v5, v6, v4, 16
-; SI-NEXT:    v_alignbit_b32 v1, v2, v0, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-NEXT:    s_setpc_b64 s[30:31]
+; SI-NEXT:    v_add_f32_e64 v11, s17, 1.0
+; SI-NEXT:    v_add_f32_e64 v10, s16, 1.0
+; SI-NEXT:    v_add_f32_e64 v9, s19, 1.0
+; SI-NEXT:    v_add_f32_e64 v8, s18, 1.0
+; SI-NEXT:    v_lshr_b64 v[5:6], v[8:9], 16
+; SI-NEXT:    v_lshr_b64 v[1:2], v[10:11], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v9
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v11
+; SI-NEXT:    s_branch .LBB37_5
 ; SI-NEXT:  .LBB37_3:
-; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $sgpr7
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr11
 ; SI-NEXT:    s_branch .LBB37_2
 ; SI-NEXT:  .LBB37_4:
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_mov_b32_e32 v2, s17
-; SI-NEXT:    v_mov_b32_e32 v4, s18
-; SI-NEXT:    v_mov_b32_e32 v6, s19
-; SI-NEXT:    v_mov_b32_e32 v3, s6
-; SI-NEXT:    v_mov_b32_e32 v7, s7
+; SI-NEXT:    v_mov_b32_e32 v10, s16
+; SI-NEXT:    v_mov_b32_e32 v11, s17
+; SI-NEXT:    v_mov_b32_e32 v8, s18
+; SI-NEXT:    v_mov_b32_e32 v9, s19
+; SI-NEXT:    v_mov_b32_e32 v3, s10
+; SI-NEXT:    v_mov_b32_e32 v7, s11
+; SI-NEXT:    v_mov_b32_e32 v5, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s6
+; SI-NEXT:  .LBB37_5: ; %end
+; SI-NEXT:    v_mov_b32_e32 v0, v10
+; SI-NEXT:    v_mov_b32_e32 v2, v11
+; SI-NEXT:    v_mov_b32_e32 v4, v8
+; SI-NEXT:    v_mov_b32_e32 v6, v9
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bitcast_v4f32_to_v8i16_scalar:
@@ -7997,64 +8002,75 @@ define inreg <16 x i8> @bitcast_v4f32_to_v16i8_scalar(<4 x float> inreg %a, i32
 ; SI-NEXT:    s_cmp_lg_u32 s20, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB49_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v11, s19, v0, 24
-; SI-NEXT:    v_alignbit_b32 v10, s19, v0, 16
-; SI-NEXT:    v_alignbit_b32 v9, s19, v0, 8
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v3, s17, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, s17, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 8
-; SI-NEXT:    s_lshr_b32 s9, s19, 24
-; SI-NEXT:    s_lshr_b32 s10, s19, 16
-; SI-NEXT:    s_lshr_b32 s11, s19, 8
-; SI-NEXT:    s_lshr_b32 s6, s17, 24
-; SI-NEXT:    s_lshr_b32 s7, s17, 16
-; SI-NEXT:    s_lshr_b32 s8, s17, 8
+; SI-NEXT:    s_lshr_b32 s24, s19, 24
+; SI-NEXT:    s_lshr_b32 s26, s19, 16
+; SI-NEXT:    s_lshr_b32 s27, s19, 8
+; SI-NEXT:    s_lshr_b32 s22, s17, 24
+; SI-NEXT:    s_lshr_b32 s23, s17, 16
+; SI-NEXT:    s_lshr_b32 s25, s17, 8
+; SI-NEXT:    s_lshr_b64 s[10:11], s[18:19], 24
+; SI-NEXT:    s_lshr_b64 s[12:13], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[18:19], 8
+; SI-NEXT:    s_lshr_b64 s[4:5], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[6:7], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[16:17], 8
 ; SI-NEXT:    s_cbranch_execnz .LBB49_4
 ; SI-NEXT:  .LBB49_2: ; %cmp.true
-; SI-NEXT:    v_add_f32_e64 v4, s17, 1.0
-; SI-NEXT:    v_add_f32_e64 v0, s16, 1.0
-; SI-NEXT:    v_add_f32_e64 v12, s19, 1.0
-; SI-NEXT:    v_add_f32_e64 v8, s18, 1.0
-; SI-NEXT:    v_alignbit_b32 v11, v12, v8, 24
-; SI-NEXT:    v_alignbit_b32 v10, v12, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, v12, v8, 8
-; SI-NEXT:    v_alignbit_b32 v3, v4, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, v4, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, v4, v0, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 24, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v13, 8, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; SI-NEXT:    s_setpc_b64 s[30:31]
+; SI-NEXT:    v_add_f32_e64 v21, s17, 1.0
+; SI-NEXT:    v_add_f32_e64 v20, s16, 1.0
+; SI-NEXT:    v_add_f32_e64 v19, s19, 1.0
+; SI-NEXT:    v_add_f32_e64 v18, s18, 1.0
+; SI-NEXT:    v_lshr_b64 v[0:1], v[20:21], 16
+; SI-NEXT:    v_lshr_b64 v[11:12], v[18:19], 24
+; SI-NEXT:    v_lshr_b64 v[16:17], v[18:19], 16
+; SI-NEXT:    v_lshr_b64 v[9:10], v[18:19], 8
+; SI-NEXT:    v_lshr_b64 v[3:4], v[20:21], 24
+; SI-NEXT:    v_lshr_b64 v[1:2], v[20:21], 8
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 24, v19
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v19
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 8, v19
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 24, v21
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v21
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v21
+; SI-NEXT:    s_branch .LBB49_5
 ; SI-NEXT:  .LBB49_3:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $sgpr7
 ; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; implicit-def: $sgpr11
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr25
+; SI-NEXT:    ; implicit-def: $sgpr23
+; SI-NEXT:    ; implicit-def: $sgpr22
+; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr12
 ; SI-NEXT:    ; implicit-def: $sgpr10
-; SI-NEXT:    ; implicit-def: $sgpr9
+; SI-NEXT:    ; implicit-def: $sgpr27
+; SI-NEXT:    ; implicit-def: $sgpr26
+; SI-NEXT:    ; implicit-def: $sgpr24
 ; SI-NEXT:    s_branch .LBB49_2
 ; SI-NEXT:  .LBB49_4:
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_mov_b32_e32 v4, s17
-; SI-NEXT:    v_mov_b32_e32 v8, s18
-; SI-NEXT:    v_mov_b32_e32 v12, s19
-; SI-NEXT:    v_mov_b32_e32 v5, s8
-; SI-NEXT:    v_mov_b32_e32 v6, s7
-; SI-NEXT:    v_mov_b32_e32 v7, s6
-; SI-NEXT:    v_mov_b32_e32 v13, s11
-; SI-NEXT:    v_mov_b32_e32 v14, s10
-; SI-NEXT:    v_mov_b32_e32 v15, s9
+; SI-NEXT:    v_mov_b32_e32 v20, s16
+; SI-NEXT:    v_mov_b32_e32 v21, s17
+; SI-NEXT:    v_mov_b32_e32 v18, s18
+; SI-NEXT:    v_mov_b32_e32 v19, s19
+; SI-NEXT:    v_mov_b32_e32 v5, s25
+; SI-NEXT:    v_mov_b32_e32 v6, s23
+; SI-NEXT:    v_mov_b32_e32 v7, s22
+; SI-NEXT:    v_mov_b32_e32 v13, s27
+; SI-NEXT:    v_mov_b32_e32 v14, s26
+; SI-NEXT:    v_mov_b32_e32 v15, s24
+; SI-NEXT:    v_mov_b32_e32 v11, s10
+; SI-NEXT:    v_mov_b32_e32 v16, s12
+; SI-NEXT:    v_mov_b32_e32 v9, s14
+; SI-NEXT:    v_mov_b32_e32 v3, s4
+; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s8
+; SI-NEXT:  .LBB49_5: ; %end
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    v_mov_b32_e32 v0, v20
+; SI-NEXT:    v_mov_b32_e32 v4, v21
+; SI-NEXT:    v_mov_b32_e32 v8, v18
+; SI-NEXT:    v_mov_b32_e32 v10, v16
+; SI-NEXT:    v_mov_b32_e32 v12, v19
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bitcast_v4f32_to_v16i8_scalar:
@@ -9769,37 +9785,35 @@ define inreg <8 x i16> @bitcast_v2i64_to_v8i16_scalar(<2 x i64> inreg %a, i32 in
 ; SI-NEXT:    s_cmp_lg_u32 s20, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB57_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s19, 16
-; SI-NEXT:    s_lshr_b32 s7, s17, 16
+; SI-NEXT:    s_lshr_b32 s10, s19, 16
+; SI-NEXT:    s_lshr_b32 s11, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB57_3
 ; SI-NEXT:  .LBB57_2: ; %cmp.true
-; SI-NEXT:    s_add_u32 s16, s16, 3
-; SI-NEXT:    s_addc_u32 s17, s17, 0
 ; SI-NEXT:    s_add_u32 s18, s18, 3
 ; SI-NEXT:    s_addc_u32 s19, s19, 0
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s19, 16
-; SI-NEXT:    s_lshr_b32 s7, s17, 16
+; SI-NEXT:    s_add_u32 s16, s16, 3
+; SI-NEXT:    s_addc_u32 s17, s17, 0
+; SI-NEXT:    s_lshr_b32 s10, s19, 16
+; SI-NEXT:    s_lshr_b32 s11, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[16:17], 16
 ; SI-NEXT:  .LBB57_3: ; %end
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
+; SI-NEXT:    v_mov_b32_e32 v1, s6
 ; SI-NEXT:    v_mov_b32_e32 v2, s17
-; SI-NEXT:    v_mov_b32_e32 v3, s7
+; SI-NEXT:    v_mov_b32_e32 v3, s11
 ; SI-NEXT:    v_mov_b32_e32 v4, s18
+; SI-NEXT:    v_mov_b32_e32 v5, s4
 ; SI-NEXT:    v_mov_b32_e32 v6, s19
-; SI-NEXT:    v_mov_b32_e32 v7, s6
+; SI-NEXT:    v_mov_b32_e32 v7, s10
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB57_4:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $vgpr5
 ; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr11
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr10
 ; SI-NEXT:    s_branch .LBB57_2
 ;
 ; VI-LABEL: bitcast_v2i64_to_v8i16_scalar:
@@ -12106,65 +12120,67 @@ define inreg <16 x i8> @bitcast_v2i64_to_v16i8_scalar(<2 x i64> inreg %a, i32 in
 ; SI-NEXT:    s_cmp_lg_u32 s20, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB69_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v11, s19, v0, 24
-; SI-NEXT:    v_alignbit_b32 v10, s19, v0, 16
-; SI-NEXT:    v_alignbit_b32 v9, s19, v0, 8
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v3, s17, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, s17, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 8
-; SI-NEXT:    s_lshr_b32 s6, s19, 24
-; SI-NEXT:    s_lshr_b32 s7, s19, 16
-; SI-NEXT:    s_lshr_b32 s8, s19, 8
-; SI-NEXT:    s_lshr_b32 s9, s17, 24
-; SI-NEXT:    s_lshr_b32 s10, s17, 16
-; SI-NEXT:    s_lshr_b32 s11, s17, 8
+; SI-NEXT:    s_lshr_b32 s22, s19, 24
+; SI-NEXT:    s_lshr_b32 s23, s19, 16
+; SI-NEXT:    s_lshr_b32 s24, s19, 8
+; SI-NEXT:    s_lshr_b32 s25, s17, 24
+; SI-NEXT:    s_lshr_b32 s26, s17, 16
+; SI-NEXT:    s_lshr_b32 s27, s17, 8
+; SI-NEXT:    s_lshr_b64 s[4:5], s[18:19], 24
+; SI-NEXT:    s_lshr_b64 s[6:7], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[18:19], 8
+; SI-NEXT:    s_lshr_b64 s[10:11], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[12:13], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[16:17], 8
 ; SI-NEXT:    s_cbranch_execnz .LBB69_3
 ; SI-NEXT:  .LBB69_2: ; %cmp.true
-; SI-NEXT:    s_add_u32 s16, s16, 3
-; SI-NEXT:    s_addc_u32 s17, s17, 0
 ; SI-NEXT:    s_add_u32 s18, s18, 3
 ; SI-NEXT:    s_addc_u32 s19, s19, 0
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v11, s19, v0, 24
-; SI-NEXT:    v_alignbit_b32 v10, s19, v0, 16
-; SI-NEXT:    v_alignbit_b32 v9, s19, v0, 8
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v3, s17, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, s17, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 8
-; SI-NEXT:    s_lshr_b32 s6, s19, 24
-; SI-NEXT:    s_lshr_b32 s7, s19, 16
-; SI-NEXT:    s_lshr_b32 s8, s19, 8
-; SI-NEXT:    s_lshr_b32 s9, s17, 24
-; SI-NEXT:    s_lshr_b32 s10, s17, 16
-; SI-NEXT:    s_lshr_b32 s11, s17, 8
+; SI-NEXT:    s_add_u32 s16, s16, 3
+; SI-NEXT:    s_addc_u32 s17, s17, 0
+; SI-NEXT:    s_lshr_b32 s22, s19, 24
+; SI-NEXT:    s_lshr_b32 s23, s19, 16
+; SI-NEXT:    s_lshr_b32 s24, s19, 8
+; SI-NEXT:    s_lshr_b32 s25, s17, 24
+; SI-NEXT:    s_lshr_b32 s26, s17, 16
+; SI-NEXT:    s_lshr_b32 s27, s17, 8
+; SI-NEXT:    s_lshr_b64 s[4:5], s[18:19], 24
+; SI-NEXT:    s_lshr_b64 s[6:7], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[18:19], 8
+; SI-NEXT:    s_lshr_b64 s[10:11], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[12:13], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[16:17], 8
 ; SI-NEXT:  .LBB69_3: ; %end
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
+; SI-NEXT:    v_mov_b32_e32 v1, s14
+; SI-NEXT:    v_mov_b32_e32 v2, s12
+; SI-NEXT:    v_mov_b32_e32 v3, s10
 ; SI-NEXT:    v_mov_b32_e32 v4, s17
-; SI-NEXT:    v_mov_b32_e32 v5, s11
-; SI-NEXT:    v_mov_b32_e32 v6, s10
-; SI-NEXT:    v_mov_b32_e32 v7, s9
+; SI-NEXT:    v_mov_b32_e32 v5, s27
+; SI-NEXT:    v_mov_b32_e32 v6, s26
+; SI-NEXT:    v_mov_b32_e32 v7, s25
 ; SI-NEXT:    v_mov_b32_e32 v8, s18
+; SI-NEXT:    v_mov_b32_e32 v9, s8
+; SI-NEXT:    v_mov_b32_e32 v10, s6
+; SI-NEXT:    v_mov_b32_e32 v11, s4
 ; SI-NEXT:    v_mov_b32_e32 v12, s19
-; SI-NEXT:    v_mov_b32_e32 v13, s8
-; SI-NEXT:    v_mov_b32_e32 v14, s7
-; SI-NEXT:    v_mov_b32_e32 v15, s6
+; SI-NEXT:    v_mov_b32_e32 v13, s24
+; SI-NEXT:    v_mov_b32_e32 v14, s23
+; SI-NEXT:    v_mov_b32_e32 v15, s22
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB69_4:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $sgpr11
+; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr12
 ; SI-NEXT:    ; implicit-def: $sgpr10
-; SI-NEXT:    ; implicit-def: $sgpr9
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $vgpr11
+; SI-NEXT:    ; implicit-def: $sgpr27
+; SI-NEXT:    ; implicit-def: $sgpr26
+; SI-NEXT:    ; implicit-def: $sgpr25
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $sgpr7
 ; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr24
+; SI-NEXT:    ; implicit-def: $sgpr23
+; SI-NEXT:    ; implicit-def: $sgpr22
 ; SI-NEXT:    s_branch .LBB69_2
 ;
 ; VI-LABEL: bitcast_v2i64_to_v16i8_scalar:
@@ -13498,34 +13514,34 @@ define inreg <8 x i16> @bitcast_v2f64_to_v8i16_scalar(<2 x double> inreg %a, i32
 ; SI-NEXT:    s_cmp_lg_u32 s20, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB73_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s7, s19, 16
-; SI-NEXT:    s_lshr_b32 s6, s17, 16
+; SI-NEXT:    s_lshr_b32 s11, s19, 16
+; SI-NEXT:    s_lshr_b32 s10, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB73_4
 ; SI-NEXT:  .LBB73_2: ; %cmp.true
 ; SI-NEXT:    v_add_f64 v[8:9], s[18:19], 1.0
 ; SI-NEXT:    v_add_f64 v[10:11], s[16:17], 1.0
-; SI-NEXT:    v_alignbit_b32 v5, v9, v8, 16
-; SI-NEXT:    v_alignbit_b32 v1, v11, v10, 16
+; SI-NEXT:    v_lshr_b64 v[5:6], v[8:9], 16
+; SI-NEXT:    v_lshr_b64 v[1:2], v[10:11], 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v9
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v11
 ; SI-NEXT:    s_branch .LBB73_5
 ; SI-NEXT:  .LBB73_3:
-; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $sgpr7
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr11
 ; SI-NEXT:    s_branch .LBB73_2
 ; SI-NEXT:  .LBB73_4:
-; SI-NEXT:    v_mov_b32_e32 v11, s17
 ; SI-NEXT:    v_mov_b32_e32 v9, s19
-; SI-NEXT:    v_mov_b32_e32 v8, s18
+; SI-NEXT:    v_mov_b32_e32 v11, s17
 ; SI-NEXT:    v_mov_b32_e32 v10, s16
-; SI-NEXT:    v_mov_b32_e32 v3, s6
-; SI-NEXT:    v_mov_b32_e32 v7, s7
+; SI-NEXT:    v_mov_b32_e32 v8, s18
+; SI-NEXT:    v_mov_b32_e32 v7, s11
+; SI-NEXT:    v_mov_b32_e32 v3, s10
+; SI-NEXT:    v_mov_b32_e32 v1, s6
+; SI-NEXT:    v_mov_b32_e32 v5, s4
 ; SI-NEXT:  .LBB73_5: ; %end
 ; SI-NEXT:    v_mov_b32_e32 v0, v10
 ; SI-NEXT:    v_mov_b32_e32 v2, v11
@@ -15789,67 +15805,73 @@ define inreg <16 x i8> @bitcast_v2f64_to_v16i8_scalar(<2 x double> inreg %a, i32
 ; SI-NEXT:    s_cmp_lg_u32 s20, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB85_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v11, s19, v0, 24
-; SI-NEXT:    v_alignbit_b32 v10, s19, v0, 16
-; SI-NEXT:    v_alignbit_b32 v9, s19, v0, 8
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v3, s17, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, s17, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 8
-; SI-NEXT:    s_lshr_b32 s11, s19, 24
-; SI-NEXT:    s_lshr_b32 s10, s19, 16
-; SI-NEXT:    s_lshr_b32 s9, s19, 8
-; SI-NEXT:    s_lshr_b32 s8, s17, 24
-; SI-NEXT:    s_lshr_b32 s7, s17, 16
-; SI-NEXT:    s_lshr_b32 s6, s17, 8
+; SI-NEXT:    s_lshr_b32 s27, s19, 24
+; SI-NEXT:    s_lshr_b32 s26, s19, 16
+; SI-NEXT:    s_lshr_b32 s25, s19, 8
+; SI-NEXT:    s_lshr_b32 s24, s17, 24
+; SI-NEXT:    s_lshr_b32 s23, s17, 16
+; SI-NEXT:    s_lshr_b32 s22, s17, 8
+; SI-NEXT:    s_lshr_b64 s[4:5], s[18:19], 24
+; SI-NEXT:    s_lshr_b64 s[6:7], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[18:19], 8
+; SI-NEXT:    s_lshr_b64 s[10:11], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[12:13], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[16:17], 8
 ; SI-NEXT:    s_cbranch_execnz .LBB85_4
 ; SI-NEXT:  .LBB85_2: ; %cmp.true
-; SI-NEXT:    v_add_f64 v[16:17], s[18:19], 1.0
-; SI-NEXT:    v_add_f64 v[18:19], s[16:17], 1.0
-; SI-NEXT:    v_alignbit_b32 v11, v17, v16, 24
-; SI-NEXT:    v_alignbit_b32 v10, v17, v16, 16
-; SI-NEXT:    v_alignbit_b32 v9, v17, v16, 8
-; SI-NEXT:    v_alignbit_b32 v3, v19, v18, 24
-; SI-NEXT:    v_alignbit_b32 v2, v19, v18, 16
-; SI-NEXT:    v_alignbit_b32 v1, v19, v18, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 24, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v13, 8, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 24, v19
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v19
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v19
+; SI-NEXT:    v_add_f64 v[20:21], s[16:17], 1.0
+; SI-NEXT:    v_add_f64 v[18:19], s[18:19], 1.0
+; SI-NEXT:    v_lshr_b64 v[0:1], v[20:21], 16
+; SI-NEXT:    v_lshr_b64 v[11:12], v[18:19], 24
+; SI-NEXT:    v_lshr_b64 v[16:17], v[18:19], 16
+; SI-NEXT:    v_lshr_b64 v[9:10], v[18:19], 8
+; SI-NEXT:    v_lshr_b64 v[3:4], v[20:21], 24
+; SI-NEXT:    v_lshr_b64 v[1:2], v[20:21], 8
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 24, v19
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v19
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 8, v19
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 24, v21
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v21
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v21
 ; SI-NEXT:    s_branch .LBB85_5
 ; SI-NEXT:  .LBB85_3:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; implicit-def: $sgpr9
+; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr12
 ; SI-NEXT:    ; implicit-def: $sgpr10
-; SI-NEXT:    ; implicit-def: $sgpr11
+; SI-NEXT:    ; implicit-def: $sgpr22
+; SI-NEXT:    ; implicit-def: $sgpr23
+; SI-NEXT:    ; implicit-def: $sgpr24
+; SI-NEXT:    ; implicit-def: $sgpr8
+; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr25
+; SI-NEXT:    ; implicit-def: $sgpr26
+; SI-NEXT:    ; implicit-def: $sgpr27
 ; SI-NEXT:    s_branch .LBB85_2
 ; SI-NEXT:  .LBB85_4:
-; SI-NEXT:    v_mov_b32_e32 v19, s17
-; SI-NEXT:    v_mov_b32_e32 v17, s19
-; SI-NEXT:    v_mov_b32_e32 v16, s18
-; SI-NEXT:    v_mov_b32_e32 v18, s16
-; SI-NEXT:    v_mov_b32_e32 v7, s8
-; SI-NEXT:    v_mov_b32_e32 v6, s7
-; SI-NEXT:    v_mov_b32_e32 v5, s6
-; SI-NEXT:    v_mov_b32_e32 v15, s11
-; SI-NEXT:    v_mov_b32_e32 v14, s10
-; SI-NEXT:    v_mov_b32_e32 v13, s9
+; SI-NEXT:    v_mov_b32_e32 v19, s19
+; SI-NEXT:    v_mov_b32_e32 v21, s17
+; SI-NEXT:    v_mov_b32_e32 v20, s16
+; SI-NEXT:    v_mov_b32_e32 v18, s18
+; SI-NEXT:    v_mov_b32_e32 v15, s27
+; SI-NEXT:    v_mov_b32_e32 v14, s26
+; SI-NEXT:    v_mov_b32_e32 v13, s25
+; SI-NEXT:    v_mov_b32_e32 v7, s24
+; SI-NEXT:    v_mov_b32_e32 v6, s23
+; SI-NEXT:    v_mov_b32_e32 v5, s22
+; SI-NEXT:    v_mov_b32_e32 v1, s14
+; SI-NEXT:    v_mov_b32_e32 v0, s12
+; SI-NEXT:    v_mov_b32_e32 v3, s10
+; SI-NEXT:    v_mov_b32_e32 v9, s8
+; SI-NEXT:    v_mov_b32_e32 v16, s6
+; SI-NEXT:    v_mov_b32_e32 v11, s4
 ; SI-NEXT:  .LBB85_5: ; %end
-; SI-NEXT:    v_mov_b32_e32 v0, v18
-; SI-NEXT:    v_mov_b32_e32 v4, v19
-; SI-NEXT:    v_mov_b32_e32 v8, v16
-; SI-NEXT:    v_mov_b32_e32 v12, v17
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    v_mov_b32_e32 v0, v20
+; SI-NEXT:    v_mov_b32_e32 v4, v21
+; SI-NEXT:    v_mov_b32_e32 v8, v18
+; SI-NEXT:    v_mov_b32_e32 v10, v16
+; SI-NEXT:    v_mov_b32_e32 v12, v19
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bitcast_v2f64_to_v16i8_scalar:
@@ -17515,11 +17537,11 @@ define inreg <8 x i16> @bitcast_v8f16_to_v8i16_scalar(<8 x half> inreg %a, i32 i
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, s16
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, s17
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, s18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, s19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, s20
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, s21
+; SI-NEXT:    v_cvt_f16_f32_e32 v8, s21
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, s22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, s23
 ; SI-NEXT:    s_cmp_lg_u32 s24, 0
@@ -17530,8 +17552,8 @@ define inreg <8 x i16> @bitcast_v8f16_to_v8i16_scalar(<8 x half> inreg %a, i32 i
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v10
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v8
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
@@ -17545,10 +17567,10 @@ define inreg <8 x i16> @bitcast_v8f16_to_v8i16_scalar(<8 x half> inreg %a, i32 i
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
+; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
@@ -17557,11 +17579,13 @@ define inreg <8 x i16> @bitcast_v8f16_to_v8i16_scalar(<8 x half> inreg %a, i32 i
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v8
+; SI-NEXT:    v_lshr_b64 v[10:11], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[8:9], v[5:6], 16
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SI-NEXT:    v_or_b32_e32 v4, v4, v5
-; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v5, v6, v5, 16
 ; SI-NEXT:  .LBB91_3: ; %end
+; SI-NEXT:    v_mov_b32_e32 v1, v10
+; SI-NEXT:    v_mov_b32_e32 v5, v8
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB91_4:
 ; SI-NEXT:    s_branch .LBB91_2
@@ -18405,60 +18429,62 @@ define inreg <8 x i16> @bitcast_v8bf16_to_v8i16_scalar(<8 x bfloat> inreg %a, i3
 ; SI-NEXT:    v_mul_f32_e64 v15, 1.0, s16
 ; SI-NEXT:    v_mul_f32_e64 v14, 1.0, s17
 ; SI-NEXT:    v_mul_f32_e64 v9, 1.0, s18
-; SI-NEXT:    v_mul_f32_e64 v8, 1.0, s19
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s19
 ; SI-NEXT:    v_mul_f32_e64 v13, 1.0, s20
 ; SI-NEXT:    v_mul_f32_e64 v12, 1.0, s21
 ; SI-NEXT:    v_mul_f32_e64 v11, 1.0, s22
-; SI-NEXT:    v_mul_f32_e64 v10, 1.0, s23
+; SI-NEXT:    v_mul_f32_e64 v5, 1.0, s23
 ; SI-NEXT:    s_cbranch_scc0 .LBB95_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v15
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v14
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v13
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v12
 ; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v11
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
 ; SI-NEXT:    s_cbranch_execnz .LBB95_3
 ; SI-NEXT:  .LBB95_2: ; %cmp.true
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v14
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v14
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v15
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_add_f32_e32 v4, 0x40c00000, v2
 ; SI-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
 ; SI-NEXT:    v_alignbit_b32 v0, v2, v0, 16
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v13
-; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v2
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v12
+; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v10
-; SI-NEXT:    v_alignbit_b32 v4, v4, v2, 16
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v11
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v11
 ; SI-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
+; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
 ; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v8
-; SI-NEXT:    v_alignbit_b32 v6, v7, v2, 16
+; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v9
-; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_alignbit_b32 v6, v7, v3, 16
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; SI-NEXT:    v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v5, v6, v5, 16
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; SI-NEXT:    v_lshr_b64 v[10:11], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[8:9], v[5:6], 16
+; SI-NEXT:    v_alignbit_b32 v4, v12, v13, 16
 ; SI-NEXT:  .LBB95_3: ; %end
+; SI-NEXT:    v_mov_b32_e32 v1, v10
+; SI-NEXT:    v_mov_b32_e32 v5, v8
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB95_4:
 ; SI-NEXT:    ; implicit-def: $vgpr0
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr10
 ; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $vgpr8
 ; SI-NEXT:    ; implicit-def: $vgpr6
 ; SI-NEXT:    ; implicit-def: $vgpr7
 ; SI-NEXT:    s_branch .LBB95_2
@@ -19152,30 +19178,28 @@ define inreg <16 x i8> @bitcast_v8i16_to_v16i8_scalar(<8 x i16> inreg %a, i32 in
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_and_b32 s4, s16, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s17, 16
-; SI-NEXT:    s_or_b32 s6, s4, s5
-; SI-NEXT:    s_and_b32 s4, s18, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s19, 16
-; SI-NEXT:    s_or_b32 s8, s4, s5
-; SI-NEXT:    s_and_b32 s4, s20, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s21, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    s_or_b32 s7, s4, s5
-; SI-NEXT:    s_and_b32 s4, s22, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s23, 16
-; SI-NEXT:    v_alignbit_b32 v3, s8, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, s8, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, s8, v0, 8
-; SI-NEXT:    s_or_b32 s9, s4, s5
-; SI-NEXT:    v_mov_b32_e32 v0, s7
-; SI-NEXT:    v_alignbit_b32 v11, s9, v0, 24
-; SI-NEXT:    v_alignbit_b32 v10, s9, v0, 16
-; SI-NEXT:    v_alignbit_b32 v9, s9, v0, 8
-; SI-NEXT:    s_lshr_b32 s10, s8, 8
-; SI-NEXT:    s_lshr_b32 s13, s9, 8
+; SI-NEXT:    s_or_b32 s4, s4, s5
+; SI-NEXT:    s_and_b32 s5, s18, 0xffff
+; SI-NEXT:    s_lshl_b32 s6, s19, 16
+; SI-NEXT:    s_or_b32 s5, s5, s6
+; SI-NEXT:    s_lshr_b64 s[8:9], s[4:5], 24
+; SI-NEXT:    s_and_b32 s6, s20, 0xffff
+; SI-NEXT:    s_lshl_b32 s7, s21, 16
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    s_and_b32 s7, s22, 0xffff
+; SI-NEXT:    s_lshl_b32 s9, s23, 16
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    s_lshr_b64 s[10:11], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[4:5], 8
+; SI-NEXT:    s_lshr_b64 s[14:15], s[6:7], 24
+; SI-NEXT:    s_lshr_b64 s[24:25], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[26:27], s[6:7], 8
+; SI-NEXT:    s_lshr_b32 s9, s5, 8
+; SI-NEXT:    s_lshr_b32 s15, s7, 8
 ; SI-NEXT:    s_and_b32 s11, s19, 0xffff
-; SI-NEXT:    s_and_b32 s14, s23, 0xffff
-; SI-NEXT:    s_bfe_u32 s12, s19, 0x80008
-; SI-NEXT:    s_bfe_u32 s15, s23, 0x80008
+; SI-NEXT:    s_and_b32 s25, s23, 0xffff
+; SI-NEXT:    s_bfe_u32 s13, s19, 0x80008
+; SI-NEXT:    s_bfe_u32 s27, s23, 0x80008
 ; SI-NEXT:    s_cbranch_execnz .LBB97_3
 ; SI-NEXT:  .LBB97_2: ; %cmp.true
 ; SI-NEXT:    s_add_i32 s20, s20, 3
@@ -19183,64 +19207,66 @@ define inreg <16 x i8> @bitcast_v8i16_to_v16i8_scalar(<8 x i16> inreg %a, i32 in
 ; SI-NEXT:    s_lshl_b32 s5, s21, 16
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s22, s22, 3
-; SI-NEXT:    s_add_i32 s7, s4, 0x30000
+; SI-NEXT:    s_add_i32 s6, s4, 0x30000
 ; SI-NEXT:    s_and_b32 s4, s22, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s23, 16
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s16, s16, 3
-; SI-NEXT:    s_add_i32 s9, s4, 0x30000
+; SI-NEXT:    s_add_i32 s7, s4, 0x30000
 ; SI-NEXT:    s_and_b32 s4, s16, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s17, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s18, s18, 3
-; SI-NEXT:    s_add_i32 s6, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s18, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s19, 16
 ; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s8, s4, 0x30000
-; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_alignbit_b32 v3, s8, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, s8, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, s8, v0, 8
-; SI-NEXT:    v_mov_b32_e32 v0, s7
-; SI-NEXT:    v_alignbit_b32 v11, s9, v0, 24
-; SI-NEXT:    v_alignbit_b32 v10, s9, v0, 16
-; SI-NEXT:    v_alignbit_b32 v9, s9, v0, 8
-; SI-NEXT:    s_lshr_b32 s12, s8, 24
-; SI-NEXT:    s_lshr_b32 s11, s8, 16
-; SI-NEXT:    s_lshr_b32 s10, s8, 8
-; SI-NEXT:    s_lshr_b32 s15, s9, 24
-; SI-NEXT:    s_lshr_b32 s14, s9, 16
-; SI-NEXT:    s_lshr_b32 s13, s9, 8
+; SI-NEXT:    s_and_b32 s5, s18, 0xffff
+; SI-NEXT:    s_lshl_b32 s8, s19, 16
+; SI-NEXT:    s_or_b32 s5, s8, s5
+; SI-NEXT:    s_add_i32 s4, s4, 0x30000
+; SI-NEXT:    s_add_i32 s5, s5, 0x30000
+; SI-NEXT:    s_lshr_b64 s[8:9], s[4:5], 24
+; SI-NEXT:    s_lshr_b64 s[10:11], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[4:5], 8
+; SI-NEXT:    s_lshr_b64 s[14:15], s[6:7], 24
+; SI-NEXT:    s_lshr_b64 s[24:25], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[26:27], s[6:7], 8
+; SI-NEXT:    s_lshr_b32 s13, s5, 24
+; SI-NEXT:    s_lshr_b32 s11, s5, 16
+; SI-NEXT:    s_lshr_b32 s9, s5, 8
+; SI-NEXT:    s_lshr_b32 s27, s7, 24
+; SI-NEXT:    s_lshr_b32 s25, s7, 16
+; SI-NEXT:    s_lshr_b32 s15, s7, 8
 ; SI-NEXT:  .LBB97_3: ; %end
-; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_mov_b32_e32 v4, s8
-; SI-NEXT:    v_mov_b32_e32 v5, s10
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s12
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    v_mov_b32_e32 v3, s8
+; SI-NEXT:    v_mov_b32_e32 v4, s5
+; SI-NEXT:    v_mov_b32_e32 v5, s9
 ; SI-NEXT:    v_mov_b32_e32 v6, s11
-; SI-NEXT:    v_mov_b32_e32 v7, s12
-; SI-NEXT:    v_mov_b32_e32 v8, s7
-; SI-NEXT:    v_mov_b32_e32 v12, s9
-; SI-NEXT:    v_mov_b32_e32 v13, s13
-; SI-NEXT:    v_mov_b32_e32 v14, s14
-; SI-NEXT:    v_mov_b32_e32 v15, s15
+; SI-NEXT:    v_mov_b32_e32 v7, s13
+; SI-NEXT:    v_mov_b32_e32 v8, s6
+; SI-NEXT:    v_mov_b32_e32 v9, s26
+; SI-NEXT:    v_mov_b32_e32 v10, s24
+; SI-NEXT:    v_mov_b32_e32 v11, s14
+; SI-NEXT:    v_mov_b32_e32 v12, s7
+; SI-NEXT:    v_mov_b32_e32 v13, s15
+; SI-NEXT:    v_mov_b32_e32 v14, s25
+; SI-NEXT:    v_mov_b32_e32 v15, s27
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB97_4:
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $sgpr10
-; SI-NEXT:    ; implicit-def: $sgpr11
+; SI-NEXT:    ; implicit-def: $sgpr4
 ; SI-NEXT:    ; implicit-def: $sgpr12
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $vgpr11
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr8
 ; SI-NEXT:    ; implicit-def: $sgpr9
+; SI-NEXT:    ; implicit-def: $sgpr11
 ; SI-NEXT:    ; implicit-def: $sgpr13
+; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr26
+; SI-NEXT:    ; implicit-def: $sgpr24
 ; SI-NEXT:    ; implicit-def: $sgpr14
 ; SI-NEXT:    ; implicit-def: $sgpr15
+; SI-NEXT:    ; implicit-def: $sgpr25
+; SI-NEXT:    ; implicit-def: $sgpr27
 ; SI-NEXT:    s_branch .LBB97_2
 ;
 ; VI-LABEL: bitcast_v8i16_to_v16i8_scalar:
@@ -20067,53 +20093,53 @@ define inreg <8 x i16> @bitcast_v16i8_to_v8i16_scalar(<16 x i8> inreg %a, i32 in
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT:    v_readfirstlane_b32 s6, v1
+; SI-NEXT:    v_readfirstlane_b32 s14, v1
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_readfirstlane_b32 s8, v0
+; SI-NEXT:    v_readfirstlane_b32 s15, v0
 ; SI-NEXT:    s_cbranch_scc0 .LBB99_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    s_and_b32 s4, s20, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s21, 8
+; SI-NEXT:    s_and_b32 s4, s16, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s17, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s22, 0xff
+; SI-NEXT:    s_and_b32 s5, s18, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s7, s23, 24
+; SI-NEXT:    s_lshl_b32 s6, s19, 24
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s7, s5
-; SI-NEXT:    s_or_b32 s10, s4, s5
-; SI-NEXT:    s_and_b32 s4, s18, 0xff
-; SI-NEXT:    s_lshl_b32 s4, s4, 16
-; SI-NEXT:    s_lshl_b32 s7, s19, 24
-; SI-NEXT:    s_or_b32 s4, s7, s4
-; SI-NEXT:    s_and_b32 s7, s28, 0xff
+; SI-NEXT:    s_or_b32 s40, s6, s5
+; SI-NEXT:    s_or_b32 s6, s4, s40
+; SI-NEXT:    s_and_b32 s4, s24, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s25, 8
+; SI-NEXT:    s_or_b32 s4, s4, s5
+; SI-NEXT:    s_and_b32 s5, s26, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    s_lshl_b32 s7, s27, 24
+; SI-NEXT:    s_or_b32 s42, s7, s5
+; SI-NEXT:    s_and_b32 s5, s20, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s21, 8
+; SI-NEXT:    s_or_b32 s5, s5, s7
+; SI-NEXT:    s_and_b32 s7, s22, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    s_lshl_b32 s8, s23, 24
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_or_b32 s7, s8, s7
+; SI-NEXT:    s_or_b32 s41, s5, s7
+; SI-NEXT:    s_lshr_b64 s[8:9], s[40:41], 16
+; SI-NEXT:    s_and_b32 s5, s28, 0xff
 ; SI-NEXT:    s_lshl_b32 s9, s29, 8
-; SI-NEXT:    s_or_b32 s7, s7, s9
-; SI-NEXT:    s_and_b32 s9, s8, 0xff
-; SI-NEXT:    s_lshl_b32 s9, s9, 16
-; SI-NEXT:    s_lshl_b32 s11, s6, 24
-; SI-NEXT:    s_or_b32 s13, s11, s9
-; SI-NEXT:    s_and_b32 s9, s26, 0xff
+; SI-NEXT:    s_or_b32 s5, s5, s9
+; SI-NEXT:    s_and_b32 s9, s15, 0xff
 ; SI-NEXT:    s_lshl_b32 s9, s9, 16
-; SI-NEXT:    s_lshl_b32 s11, s27, 24
-; SI-NEXT:    s_or_b32 s9, s11, s9
-; SI-NEXT:    s_and_b32 s11, s16, 0xff
-; SI-NEXT:    s_lshl_b32 s12, s17, 8
-; SI-NEXT:    s_or_b32 s11, s11, s12
-; SI-NEXT:    s_and_b32 s11, s11, 0xffff
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    s_or_b32 s11, s11, s4
-; SI-NEXT:    s_and_b32 s4, s24, 0xff
-; SI-NEXT:    s_lshl_b32 s12, s25, 8
-; SI-NEXT:    s_and_b32 s7, s7, 0xffff
-; SI-NEXT:    s_or_b32 s4, s4, s12
-; SI-NEXT:    v_alignbit_b32 v1, s10, v0, 16
-; SI-NEXT:    s_or_b32 s7, s7, s13
-; SI-NEXT:    v_mov_b32_e32 v0, s9
+; SI-NEXT:    s_lshl_b32 s10, s14, 24
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_or_b32 s12, s10, s9
+; SI-NEXT:    s_or_b32 s43, s5, s12
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_alignbit_b32 v5, s7, v0, 16
-; SI-NEXT:    s_or_b32 s9, s4, s9
-; SI-NEXT:    s_lshr_b32 s12, s5, 16
-; SI-NEXT:    s_lshr_b32 s13, s13, 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[42:43], 16
+; SI-NEXT:    s_or_b32 s4, s4, s42
+; SI-NEXT:    s_lshr_b32 s9, s7, 16
+; SI-NEXT:    s_lshr_b32 s11, s12, 16
+; SI-NEXT:    s_mov_b32 s7, s41
+; SI-NEXT:    s_mov_b32 s5, s43
 ; SI-NEXT:    s_cbranch_execnz .LBB99_3
 ; SI-NEXT:  .LBB99_2: ; %cmp.true
 ; SI-NEXT:    s_add_i32 s24, s24, 3
@@ -20121,76 +20147,74 @@ define inreg <8 x i16> @bitcast_v16i8_to_v8i16_scalar(<16 x i8> inreg %a, i32 in
 ; SI-NEXT:    s_lshl_b32 s5, s25, 8
 ; SI-NEXT:    s_add_i32 s26, s26, 3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_and_b32 s7, s26, 0xff
+; SI-NEXT:    s_and_b32 s6, s26, 0xff
 ; SI-NEXT:    s_addk_i32 s4, 0x300
 ; SI-NEXT:    s_lshl_b32 s5, s27, 24
-; SI-NEXT:    s_lshl_b32 s7, s7, 16
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s5, s7
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s28, s28, 3
-; SI-NEXT:    s_add_i32 s9, s4, 0x3000000
-; SI-NEXT:    s_and_b32 s4, s28, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s29, 8
-; SI-NEXT:    s_add_i32 s8, s8, 3
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_lshl_b32 s5, s6, 24
-; SI-NEXT:    s_and_b32 s6, s8, 0xff
-; SI-NEXT:    s_addk_i32 s4, 0x300
 ; SI-NEXT:    s_lshl_b32 s6, s6, 16
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s5, s5, s6
+; SI-NEXT:    s_add_i32 s28, s28, 3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    s_and_b32 s5, s28, 0xff
+; SI-NEXT:    s_lshl_b32 s6, s29, 8
+; SI-NEXT:    s_add_i32 s15, s15, 3
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    s_and_b32 s7, s15, 0xff
+; SI-NEXT:    s_addk_i32 s5, 0x300
+; SI-NEXT:    s_lshl_b32 s6, s14, 24
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_or_b32 s6, s6, s7
 ; SI-NEXT:    s_add_i32 s16, s16, 3
-; SI-NEXT:    s_add_i32 s7, s4, 0x3000000
-; SI-NEXT:    s_and_b32 s4, s16, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s17, 8
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    s_and_b32 s6, s16, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s17, 8
 ; SI-NEXT:    s_add_i32 s18, s18, 3
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_and_b32 s6, s18, 0xff
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s19, 24
-; SI-NEXT:    s_lshl_b32 s6, s6, 16
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s5, s6
-; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    s_or_b32 s6, s7, s6
+; SI-NEXT:    s_and_b32 s8, s18, 0xff
+; SI-NEXT:    s_addk_i32 s6, 0x300
+; SI-NEXT:    s_lshl_b32 s7, s19, 24
+; SI-NEXT:    s_lshl_b32 s8, s8, 16
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_or_b32 s7, s7, s8
 ; SI-NEXT:    s_add_i32 s20, s20, 3
-; SI-NEXT:    s_add_i32 s11, s4, 0x3000000
-; SI-NEXT:    s_and_b32 s4, s20, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s21, 8
+; SI-NEXT:    s_or_b32 s6, s7, s6
+; SI-NEXT:    s_and_b32 s7, s20, 0xff
+; SI-NEXT:    s_lshl_b32 s8, s21, 8
 ; SI-NEXT:    s_add_i32 s22, s22, 3
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_and_b32 s6, s22, 0xff
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s23, 24
-; SI-NEXT:    s_lshl_b32 s6, s6, 16
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s5, s6
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s10, s4, 0x3000000
-; SI-NEXT:    v_mov_b32_e32 v0, s11
-; SI-NEXT:    v_alignbit_b32 v1, s10, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s9
-; SI-NEXT:    v_alignbit_b32 v5, s7, v0, 16
-; SI-NEXT:    s_lshr_b32 s12, s10, 16
-; SI-NEXT:    s_lshr_b32 s13, s7, 16
+; SI-NEXT:    s_or_b32 s7, s8, s7
+; SI-NEXT:    s_and_b32 s9, s22, 0xff
+; SI-NEXT:    s_addk_i32 s7, 0x300
+; SI-NEXT:    s_lshl_b32 s8, s23, 24
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    s_or_b32 s7, s8, s7
+; SI-NEXT:    s_add_i32 s4, s4, 0x3000000
+; SI-NEXT:    s_add_i32 s5, s5, 0x3000000
+; SI-NEXT:    s_add_i32 s6, s6, 0x3000000
+; SI-NEXT:    s_add_i32 s7, s7, 0x3000000
+; SI-NEXT:    s_lshr_b64 s[8:9], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[4:5], 16
+; SI-NEXT:    s_lshr_b32 s9, s7, 16
+; SI-NEXT:    s_lshr_b32 s11, s5, 16
 ; SI-NEXT:  .LBB99_3: ; %end
-; SI-NEXT:    v_mov_b32_e32 v0, s11
-; SI-NEXT:    v_mov_b32_e32 v2, s10
-; SI-NEXT:    v_mov_b32_e32 v3, s12
-; SI-NEXT:    v_mov_b32_e32 v4, s9
-; SI-NEXT:    v_mov_b32_e32 v6, s7
-; SI-NEXT:    v_mov_b32_e32 v7, s13
+; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s8
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    v_mov_b32_e32 v3, s9
+; SI-NEXT:    v_mov_b32_e32 v4, s4
+; SI-NEXT:    v_mov_b32_e32 v5, s10
+; SI-NEXT:    v_mov_b32_e32 v6, s5
+; SI-NEXT:    v_mov_b32_e32 v7, s11
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB99_4:
-; SI-NEXT:    ; implicit-def: $sgpr11
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr10
-; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr8
 ; SI-NEXT:    ; implicit-def: $sgpr9
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $sgpr13
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr11
 ; SI-NEXT:    s_branch .LBB99_2
 ;
 ; VI-LABEL: bitcast_v16i8_to_v8i16_scalar:
@@ -22076,41 +22100,41 @@ define inreg <16 x i8> @bitcast_v8f16_to_v16i8_scalar(<8 x half> inreg %a, i32 i
 ; SI-LABEL: bitcast_v8f16_to_v16i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v18, s17
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, s16
+; SI-NEXT:    v_cvt_f16_f32_e32 v23, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v16, s16
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, s19
-; SI-NEXT:    v_cvt_f16_f32_e32 v16, s18
-; SI-NEXT:    v_cvt_f16_f32_e32 v21, s21
-; SI-NEXT:    v_cvt_f16_f32_e32 v20, s20
+; SI-NEXT:    v_cvt_f16_f32_e32 v8, s18
+; SI-NEXT:    v_cvt_f16_f32_e32 v26, s21
+; SI-NEXT:    v_cvt_f16_f32_e32 v25, s20
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, s23
-; SI-NEXT:    v_cvt_f16_f32_e32 v19, s22
+; SI-NEXT:    v_cvt_f16_f32_e32 v24, s22
 ; SI-NEXT:    s_cmp_lg_u32 s24, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB105_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v21
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v18
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
-; SI-NEXT:    v_or_b32_e32 v8, v20, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v14
-; SI-NEXT:    v_or_b32_e32 v0, v17, v0
-; SI-NEXT:    v_or_b32_e32 v4, v16, v1
-; SI-NEXT:    v_or_b32_e32 v12, v19, v5
-; SI-NEXT:    v_alignbit_b32 v3, v4, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, v4, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, v4, v0, 8
-; SI-NEXT:    v_alignbit_b32 v11, v12, v8, 24
-; SI-NEXT:    v_alignbit_b32 v10, v12, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, v12, v8, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v13, 8, v12
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v23
+; SI-NEXT:    v_or_b32_e32 v19, v16, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
+; SI-NEXT:    v_or_b32_e32 v20, v8, v0
+; SI-NEXT:    v_lshr_b64 v[0:1], v[19:20], 16
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v26
+; SI-NEXT:    v_or_b32_e32 v17, v25, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v14
+; SI-NEXT:    v_or_b32_e32 v18, v24, v1
+; SI-NEXT:    v_lshr_b64 v[3:4], v[19:20], 24
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 8, v18
 ; SI-NEXT:    v_bfe_u32 v7, v6, 8, 8
 ; SI-NEXT:    v_bfe_u32 v15, v14, 8, 8
+; SI-NEXT:    v_lshr_b64 v[1:2], v[19:20], 8
+; SI-NEXT:    v_lshr_b64 v[11:12], v[17:18], 24
+; SI-NEXT:    v_lshr_b64 v[21:22], v[17:18], 16
+; SI-NEXT:    v_lshr_b64 v[9:10], v[17:18], 8
 ; SI-NEXT:    s_cbranch_execnz .LBB105_3
 ; SI-NEXT:  .LBB105_2: ; %cmp.true
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, v21
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v20
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v26
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v25
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v14
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v19
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v24
 ; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
@@ -22120,13 +22144,13 @@ define inreg <16 x i8> @bitcast_v8f16_to_v16i8_scalar(<8 x half> inreg %a, i32 i
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; SI-NEXT:    v_or_b32_e32 v8, v1, v0
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, v18
+; SI-NEXT:    v_or_b32_e32 v17, v1, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v23
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v14
-; SI-NEXT:    v_or_b32_e32 v12, v2, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v17
+; SI-NEXT:    v_or_b32_e32 v18, v2, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v16
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v8
 ; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
@@ -22136,34 +22160,38 @@ define inreg <16 x i8> @bitcast_v8f16_to_v16i8_scalar(<8 x half> inreg %a, i32 i
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
-; SI-NEXT:    v_or_b32_e32 v4, v2, v1
-; SI-NEXT:    v_alignbit_b32 v3, v4, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, v4, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, v4, v0, 8
-; SI-NEXT:    v_alignbit_b32 v11, v12, v8, 24
-; SI-NEXT:    v_alignbit_b32 v10, v12, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, v12, v8, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v13, 8, v12
+; SI-NEXT:    v_or_b32_e32 v19, v1, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
+; SI-NEXT:    v_or_b32_e32 v20, v2, v0
+; SI-NEXT:    v_lshr_b64 v[0:1], v[19:20], 16
+; SI-NEXT:    v_lshr_b64 v[3:4], v[19:20], 24
+; SI-NEXT:    v_lshr_b64 v[1:2], v[19:20], 8
+; SI-NEXT:    v_lshr_b64 v[11:12], v[17:18], 24
+; SI-NEXT:    v_lshr_b64 v[21:22], v[17:18], 16
+; SI-NEXT:    v_lshr_b64 v[9:10], v[17:18], 8
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 8, v18
 ; SI-NEXT:    v_bfe_u32 v7, v6, 8, 8
 ; SI-NEXT:    v_bfe_u32 v15, v14, 8, 8
 ; SI-NEXT:  .LBB105_3: ; %end
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    v_mov_b32_e32 v0, v19
+; SI-NEXT:    v_mov_b32_e32 v4, v20
+; SI-NEXT:    v_mov_b32_e32 v8, v17
+; SI-NEXT:    v_mov_b32_e32 v10, v21
+; SI-NEXT:    v_mov_b32_e32 v12, v18
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB105_4:
-; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr19
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr0
 ; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr4
 ; SI-NEXT:    ; implicit-def: $vgpr5
 ; SI-NEXT:    ; implicit-def: $vgpr7
-; SI-NEXT:    ; implicit-def: $vgpr8
+; SI-NEXT:    ; implicit-def: $vgpr17
 ; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr10
+; SI-NEXT:    ; implicit-def: $vgpr21
 ; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; implicit-def: $vgpr12
 ; SI-NEXT:    ; implicit-def: $vgpr13
 ; SI-NEXT:    ; implicit-def: $vgpr15
 ; SI-NEXT:    s_branch .LBB105_2
@@ -24073,89 +24101,94 @@ define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i3
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_cmp_lg_u32 s24, 0
-; SI-NEXT:    v_mul_f32_e64 v18, 1.0, s17
-; SI-NEXT:    v_mul_f32_e64 v19, 1.0, s16
-; SI-NEXT:    v_mul_f32_e64 v16, 1.0, s19
-; SI-NEXT:    v_mul_f32_e64 v17, 1.0, s18
-; SI-NEXT:    v_mul_f32_e64 v22, 1.0, s21
-; SI-NEXT:    v_mul_f32_e64 v23, 1.0, s20
-; SI-NEXT:    v_mul_f32_e64 v20, 1.0, s23
-; SI-NEXT:    v_mul_f32_e64 v21, 1.0, s22
+; SI-NEXT:    v_mul_f32_e64 v9, 1.0, s17
+; SI-NEXT:    v_mul_f32_e64 v16, 1.0, s16
+; SI-NEXT:    v_mul_f32_e64 v0, 1.0, s19
+; SI-NEXT:    v_mul_f32_e64 v8, 1.0, s18
+; SI-NEXT:    v_mul_f32_e64 v25, 1.0, s21
+; SI-NEXT:    v_mul_f32_e64 v26, 1.0, s20
+; SI-NEXT:    v_mul_f32_e64 v23, 1.0, s23
+; SI-NEXT:    v_mul_f32_e64 v24, 1.0, s22
 ; SI-NEXT:    s_cbranch_scc0 .LBB109_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v22
-; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v20
-; SI-NEXT:    v_alignbit_b32 v0, v0, v19, 16
-; SI-NEXT:    v_alignbit_b32 v4, v6, v17, 16
-; SI-NEXT:    v_alignbit_b32 v8, v5, v23, 16
-; SI-NEXT:    v_alignbit_b32 v12, v14, v21, 16
-; SI-NEXT:    v_alignbit_b32 v3, v4, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, v4, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, v4, v0, 8
-; SI-NEXT:    v_alignbit_b32 v11, v12, v8, 24
-; SI-NEXT:    v_alignbit_b32 v10, v12, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, v12, v8, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 24, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 24, v20
-; SI-NEXT:    v_lshrrev_b32_e32 v13, 8, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v9
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; SI-NEXT:    v_alignbit_b32 v19, v1, v16, 16
+; SI-NEXT:    v_alignbit_b32 v20, v6, v8, 16
+; SI-NEXT:    v_lshr_b64 v[1:2], v[19:20], 8
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v25
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v23
+; SI-NEXT:    v_lshr_b64 v[3:4], v[19:20], 24
+; SI-NEXT:    v_alignbit_b32 v21, v2, v26, 16
+; SI-NEXT:    v_alignbit_b32 v22, v14, v24, 16
+; SI-NEXT:    v_lshr_b64 v[4:5], v[19:20], 16
+; SI-NEXT:    v_lshr_b64 v[10:11], v[21:22], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 24, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 24, v23
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 8, v22
+; SI-NEXT:    v_lshr_b64 v[17:18], v[21:22], 24
+; SI-NEXT:    v_lshr_b64 v[11:12], v[21:22], 8
 ; SI-NEXT:    s_cbranch_execnz .LBB109_3
 ; SI-NEXT:  .LBB109_2: ; %cmp.true
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v22
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v23
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v25
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v26
+; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_alignbit_b32 v21, v2, v1, 16
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v23
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v24
+; SI-NEXT:    v_add_f32_e32 v15, 0x40c00000, v2
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_alignbit_b32 v8, v1, v0, 16
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v20
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v21
-; SI-NEXT:    v_add_f32_e32 v15, 0x40c00000, v1
-; SI-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
 ; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v15
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v18
-; SI-NEXT:    v_alignbit_b32 v12, v14, v0, 16
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v19
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v9
+; SI-NEXT:    v_alignbit_b32 v22, v14, v1, 16
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v16
+; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; SI-NEXT:    v_alignbit_b32 v19, v2, v1, 16
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v8
 ; SI-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v16
-; SI-NEXT:    v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v17
-; SI-NEXT:    v_add_f32_e32 v7, 0x40c00000, v2
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v7
-; SI-NEXT:    v_alignbit_b32 v4, v6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v3, v4, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, v4, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, v4, v0, 8
-; SI-NEXT:    v_alignbit_b32 v11, v12, v8, 24
-; SI-NEXT:    v_alignbit_b32 v10, v12, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, v12, v8, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v13, 8, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 24, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; SI-NEXT:    v_alignbit_b32 v20, v6, v1, 16
+; SI-NEXT:    v_lshr_b64 v[3:4], v[19:20], 24
+; SI-NEXT:    v_lshr_b64 v[10:11], v[21:22], 16
+; SI-NEXT:    v_lshr_b64 v[4:5], v[19:20], 16
+; SI-NEXT:    v_lshr_b64 v[1:2], v[19:20], 8
+; SI-NEXT:    v_lshr_b64 v[17:18], v[21:22], 24
+; SI-NEXT:    v_lshr_b64 v[11:12], v[21:22], 8
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 8, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 24, v0
 ; SI-NEXT:    v_lshrrev_b32_e32 v15, 24, v15
 ; SI-NEXT:  .LBB109_3: ; %end
+; SI-NEXT:    v_mov_b32_e32 v0, v19
+; SI-NEXT:    v_mov_b32_e32 v2, v4
+; SI-NEXT:    v_mov_b32_e32 v4, v20
+; SI-NEXT:    v_mov_b32_e32 v8, v21
+; SI-NEXT:    v_mov_b32_e32 v9, v11
+; SI-NEXT:    v_mov_b32_e32 v11, v17
+; SI-NEXT:    v_mov_b32_e32 v12, v22
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB109_4:
-; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr19
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr4
+; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr5
 ; SI-NEXT:    ; implicit-def: $vgpr6
 ; SI-NEXT:    ; implicit-def: $vgpr7
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr10
+; SI-NEXT:    ; implicit-def: $vgpr21
 ; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; implicit-def: $vgpr12
+; SI-NEXT:    ; implicit-def: $vgpr10
 ; SI-NEXT:    ; implicit-def: $vgpr13
 ; SI-NEXT:    ; implicit-def: $vgpr14
 ; SI-NEXT:    ; implicit-def: $vgpr15
+; SI-NEXT:    ; implicit-def: $vgpr17
 ; SI-NEXT:    s_branch .LBB109_2
 ;
 ; VI-LABEL: bitcast_v8bf16_to_v16i8_scalar:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll
index c87d52c1e6907..ee209f84efe7c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll
@@ -520,44 +520,41 @@ define inreg <10 x i16> @bitcast_v5i32_to_v10i16_scalar(<5 x i32> inreg %a, i32
 ; SI-NEXT:    s_cmp_lg_u32 s21, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB5_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v9, s4, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s19, 16
-; SI-NEXT:    s_lshr_b32 s7, s17, 16
+; SI-NEXT:    s_lshr_b32 s12, s19, 16
+; SI-NEXT:    s_lshr_b32 s13, s17, 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB5_3
 ; SI-NEXT:  .LBB5_2: ; %cmp.true
-; SI-NEXT:    s_add_i32 s18, s18, 3
-; SI-NEXT:    s_add_i32 s16, s16, 3
-; SI-NEXT:    s_add_i32 s19, s19, 3
-; SI-NEXT:    v_mov_b32_e32 v0, s18
 ; SI-NEXT:    s_add_i32 s20, s20, 3
 ; SI-NEXT:    s_add_i32 s17, s17, 3
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v9, s4, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s19, 16
-; SI-NEXT:    s_lshr_b32 s7, s17, 16
+; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    s_add_i32 s19, s19, 3
+; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_lshr_b64 s[4:5], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[16:17], 16
+; SI-NEXT:    s_lshr_b32 s12, s19, 16
+; SI-NEXT:    s_lshr_b32 s13, s17, 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[20:21], 16
 ; SI-NEXT:  .LBB5_3: ; %end
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
+; SI-NEXT:    v_mov_b32_e32 v1, s8
 ; SI-NEXT:    v_mov_b32_e32 v2, s17
-; SI-NEXT:    v_mov_b32_e32 v3, s7
+; SI-NEXT:    v_mov_b32_e32 v3, s13
 ; SI-NEXT:    v_mov_b32_e32 v4, s18
+; SI-NEXT:    v_mov_b32_e32 v5, s4
 ; SI-NEXT:    v_mov_b32_e32 v6, s19
-; SI-NEXT:    v_mov_b32_e32 v7, s6
+; SI-NEXT:    v_mov_b32_e32 v7, s12
 ; SI-NEXT:    v_mov_b32_e32 v8, s20
+; SI-NEXT:    v_mov_b32_e32 v9, s6
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB5_4:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $sgpr8
+; SI-NEXT:    ; implicit-def: $sgpr13
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr12
 ; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr9
 ; SI-NEXT:    s_branch .LBB5_2
 ;
 ; VI-LABEL: bitcast_v5i32_to_v10i16_scalar:
@@ -1731,42 +1728,47 @@ define inreg <10 x i16> @bitcast_v5f32_to_v10i16_scalar(<5 x float> inreg %a, i3
 ; SI-NEXT:    s_cmp_lg_u32 s21, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB13_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v9, s4, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s7, s19, 16
-; SI-NEXT:    s_lshr_b32 s6, s17, 16
+; SI-NEXT:    s_lshr_b32 s13, s19, 16
+; SI-NEXT:    s_lshr_b32 s12, s17, 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB13_4
 ; SI-NEXT:  .LBB13_2: ; %cmp.true
 ; SI-NEXT:    v_add_f32_e64 v8, s20, 1.0
-; SI-NEXT:    v_add_f32_e64 v2, s17, 1.0
-; SI-NEXT:    v_add_f32_e64 v0, s16, 1.0
-; SI-NEXT:    v_add_f32_e64 v6, s19, 1.0
-; SI-NEXT:    v_add_f32_e64 v4, s18, 1.0
-; SI-NEXT:    v_alignbit_b32 v5, v6, v4, 16
-; SI-NEXT:    v_alignbit_b32 v1, v2, v0, 16
-; SI-NEXT:    v_alignbit_b32 v9, s4, v8, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-NEXT:    s_setpc_b64 s[30:31]
+; SI-NEXT:    v_add_f32_e64 v14, s17, 1.0
+; SI-NEXT:    v_add_f32_e64 v13, s16, 1.0
+; SI-NEXT:    v_add_f32_e64 v12, s19, 1.0
+; SI-NEXT:    v_add_f32_e64 v11, s18, 1.0
+; SI-NEXT:    v_lshr_b64 v[5:6], v[11:12], 16
+; SI-NEXT:    v_lshr_b64 v[1:2], v[13:14], 16
+; SI-NEXT:    v_lshr_b64 v[9:10], v[8:9], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v14
+; SI-NEXT:    s_branch .LBB13_5
 ; SI-NEXT:  .LBB13_3:
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $sgpr8
+; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr13
 ; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $vgpr9
 ; SI-NEXT:    s_branch .LBB13_2
 ; SI-NEXT:  .LBB13_4:
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_mov_b32_e32 v2, s17
-; SI-NEXT:    v_mov_b32_e32 v4, s18
-; SI-NEXT:    v_mov_b32_e32 v6, s19
+; SI-NEXT:    v_mov_b32_e32 v13, s16
+; SI-NEXT:    v_mov_b32_e32 v14, s17
+; SI-NEXT:    v_mov_b32_e32 v11, s18
+; SI-NEXT:    v_mov_b32_e32 v12, s19
 ; SI-NEXT:    v_mov_b32_e32 v8, s20
-; SI-NEXT:    v_mov_b32_e32 v3, s6
-; SI-NEXT:    v_mov_b32_e32 v7, s7
+; SI-NEXT:    v_mov_b32_e32 v3, s12
+; SI-NEXT:    v_mov_b32_e32 v7, s13
+; SI-NEXT:    v_mov_b32_e32 v9, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s8
+; SI-NEXT:    v_mov_b32_e32 v5, s4
+; SI-NEXT:  .LBB13_5: ; %end
+; SI-NEXT:    v_mov_b32_e32 v0, v13
+; SI-NEXT:    v_mov_b32_e32 v2, v14
+; SI-NEXT:    v_mov_b32_e32 v4, v11
+; SI-NEXT:    v_mov_b32_e32 v6, v12
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bitcast_v5f32_to_v10i16_scalar:
@@ -3319,11 +3321,11 @@ define inreg <10 x i16> @bitcast_v10f16_to_v10i16_scalar(<10 x half> inreg %a, i
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, s16
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, s17
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, s18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, s19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, s20
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, s21
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, s21
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, s22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, s23
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, s24
@@ -3333,49 +3335,51 @@ define inreg <10 x i16> @bitcast_v10f16_to_v10i16_scalar(<10 x half> inreg %a, i
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_cbranch_execnz .LBB23_3
 ; SI-NEXT:  .LBB23_2: ; %cmp.true
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v10
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v11
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
-; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
-; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
+; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
 ; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v9
-; SI-NEXT:    v_or_b32_e32 v8, v8, v10
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
 ; SI-NEXT:    v_or_b32_e32 v6, v6, v10
 ; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v10
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_lshr_b64 v[10:11], v[1:2], 16
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v9
+; SI-NEXT:    v_lshr_b64 v[11:12], v[5:6], 16
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SI-NEXT:    v_or_b32_e32 v4, v4, v5
-; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v5, v6, v5, 16
+; SI-NEXT:    v_or_b32_e32 v8, v8, v13
 ; SI-NEXT:  .LBB23_3: ; %end
+; SI-NEXT:    v_mov_b32_e32 v1, v10
+; SI-NEXT:    v_mov_b32_e32 v5, v11
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB23_4:
 ; SI-NEXT:    s_branch .LBB23_2
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll
index c3ace0ac5af71..57eae8600dc4a 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll
@@ -1360,50 +1360,47 @@ define inreg <12 x i16> @bitcast_v6i32_to_v12i16_scalar(<6 x i32> inreg %a, i32
 ; SI-NEXT:    s_cmp_lg_u32 s22, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB13_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v9, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s21, 16
-; SI-NEXT:    s_lshr_b32 s7, s19, 16
-; SI-NEXT:    s_lshr_b32 s8, s17, 16
+; SI-NEXT:    s_lshr_b32 s12, s21, 16
+; SI-NEXT:    s_lshr_b32 s13, s19, 16
+; SI-NEXT:    s_lshr_b32 s14, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB13_3
 ; SI-NEXT:  .LBB13_2: ; %cmp.true
-; SI-NEXT:    s_add_i32 s20, s20, 3
-; SI-NEXT:    s_add_i32 s18, s18, 3
-; SI-NEXT:    s_add_i32 s21, s21, 3
-; SI-NEXT:    v_mov_b32_e32 v0, s20
+; SI-NEXT:    s_add_i32 s17, s17, 3
 ; SI-NEXT:    s_add_i32 s16, s16, 3
 ; SI-NEXT:    s_add_i32 s19, s19, 3
-; SI-NEXT:    v_alignbit_b32 v9, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    s_add_i32 s17, s17, 3
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s21, 16
-; SI-NEXT:    s_lshr_b32 s7, s19, 16
-; SI-NEXT:    s_lshr_b32 s8, s17, 16
+; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_add_i32 s21, s21, 3
+; SI-NEXT:    s_add_i32 s20, s20, 3
+; SI-NEXT:    s_lshr_b64 s[4:5], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[16:17], 16
+; SI-NEXT:    s_lshr_b32 s12, s21, 16
+; SI-NEXT:    s_lshr_b32 s13, s19, 16
+; SI-NEXT:    s_lshr_b32 s14, s17, 16
 ; SI-NEXT:  .LBB13_3: ; %end
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
+; SI-NEXT:    v_mov_b32_e32 v1, s8
 ; SI-NEXT:    v_mov_b32_e32 v2, s17
-; SI-NEXT:    v_mov_b32_e32 v3, s8
+; SI-NEXT:    v_mov_b32_e32 v3, s14
 ; SI-NEXT:    v_mov_b32_e32 v4, s18
+; SI-NEXT:    v_mov_b32_e32 v5, s6
 ; SI-NEXT:    v_mov_b32_e32 v6, s19
-; SI-NEXT:    v_mov_b32_e32 v7, s7
+; SI-NEXT:    v_mov_b32_e32 v7, s13
 ; SI-NEXT:    v_mov_b32_e32 v8, s20
+; SI-NEXT:    v_mov_b32_e32 v9, s4
 ; SI-NEXT:    v_mov_b32_e32 v10, s21
-; SI-NEXT:    v_mov_b32_e32 v11, s6
+; SI-NEXT:    v_mov_b32_e32 v11, s12
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB13_4:
-; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $sgpr14
 ; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr13
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr12
 ; SI-NEXT:    s_branch .LBB13_2
 ;
 ; VI-LABEL: bitcast_v6i32_to_v12i16_scalar:
@@ -3505,48 +3502,55 @@ define inreg <12 x i16> @bitcast_v6f32_to_v12i16_scalar(<6 x float> inreg %a, i3
 ; SI-NEXT:    s_cmp_lg_u32 s22, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB29_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v9, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s8, s21, 16
-; SI-NEXT:    s_lshr_b32 s7, s19, 16
-; SI-NEXT:    s_lshr_b32 s6, s17, 16
+; SI-NEXT:    s_lshr_b32 s14, s21, 16
+; SI-NEXT:    s_lshr_b32 s13, s19, 16
+; SI-NEXT:    s_lshr_b32 s12, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB29_4
 ; SI-NEXT:  .LBB29_2: ; %cmp.true
-; SI-NEXT:    v_add_f32_e64 v2, s17, 1.0
-; SI-NEXT:    v_add_f32_e64 v0, s16, 1.0
-; SI-NEXT:    v_add_f32_e64 v6, s19, 1.0
-; SI-NEXT:    v_add_f32_e64 v4, s18, 1.0
-; SI-NEXT:    v_add_f32_e64 v10, s21, 1.0
-; SI-NEXT:    v_add_f32_e64 v8, s20, 1.0
-; SI-NEXT:    v_alignbit_b32 v9, v10, v8, 16
-; SI-NEXT:    v_alignbit_b32 v5, v6, v4, 16
-; SI-NEXT:    v_alignbit_b32 v1, v2, v0, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-NEXT:    s_setpc_b64 s[30:31]
+; SI-NEXT:    v_add_f32_e64 v17, s17, 1.0
+; SI-NEXT:    v_add_f32_e64 v16, s16, 1.0
+; SI-NEXT:    v_add_f32_e64 v15, s19, 1.0
+; SI-NEXT:    v_add_f32_e64 v14, s18, 1.0
+; SI-NEXT:    v_add_f32_e64 v13, s21, 1.0
+; SI-NEXT:    v_add_f32_e64 v12, s20, 1.0
+; SI-NEXT:    v_lshr_b64 v[9:10], v[12:13], 16
+; SI-NEXT:    v_lshr_b64 v[5:6], v[14:15], 16
+; SI-NEXT:    v_lshr_b64 v[1:2], v[16:17], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v13
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v15
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v17
+; SI-NEXT:    s_branch .LBB29_5
 ; SI-NEXT:  .LBB29_3:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $vgpr9
 ; SI-NEXT:    ; implicit-def: $sgpr8
+; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr13
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr14
 ; SI-NEXT:    s_branch .LBB29_2
 ; SI-NEXT:  .LBB29_4:
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_mov_b32_e32 v2, s17
-; SI-NEXT:    v_mov_b32_e32 v4, s18
-; SI-NEXT:    v_mov_b32_e32 v6, s19
-; SI-NEXT:    v_mov_b32_e32 v8, s20
-; SI-NEXT:    v_mov_b32_e32 v10, s21
-; SI-NEXT:    v_mov_b32_e32 v3, s6
-; SI-NEXT:    v_mov_b32_e32 v7, s7
-; SI-NEXT:    v_mov_b32_e32 v11, s8
+; SI-NEXT:    v_mov_b32_e32 v16, s16
+; SI-NEXT:    v_mov_b32_e32 v17, s17
+; SI-NEXT:    v_mov_b32_e32 v14, s18
+; SI-NEXT:    v_mov_b32_e32 v15, s19
+; SI-NEXT:    v_mov_b32_e32 v12, s20
+; SI-NEXT:    v_mov_b32_e32 v13, s21
+; SI-NEXT:    v_mov_b32_e32 v3, s12
+; SI-NEXT:    v_mov_b32_e32 v7, s13
+; SI-NEXT:    v_mov_b32_e32 v11, s14
+; SI-NEXT:    v_mov_b32_e32 v1, s8
+; SI-NEXT:    v_mov_b32_e32 v5, s6
+; SI-NEXT:    v_mov_b32_e32 v9, s4
+; SI-NEXT:  .LBB29_5: ; %end
+; SI-NEXT:    v_mov_b32_e32 v0, v16
+; SI-NEXT:    v_mov_b32_e32 v2, v17
+; SI-NEXT:    v_mov_b32_e32 v4, v14
+; SI-NEXT:    v_mov_b32_e32 v6, v15
+; SI-NEXT:    v_mov_b32_e32 v8, v12
+; SI-NEXT:    v_mov_b32_e32 v10, v13
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bitcast_v6f32_to_v12i16_scalar:
@@ -5249,50 +5253,47 @@ define inreg <12 x i16> @bitcast_v3i64_to_v12i16_scalar(<3 x i64> inreg %a, i32
 ; SI-NEXT:    s_cmp_lg_u32 s22, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB41_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v9, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s21, 16
-; SI-NEXT:    s_lshr_b32 s7, s19, 16
-; SI-NEXT:    s_lshr_b32 s8, s17, 16
+; SI-NEXT:    s_lshr_b32 s12, s21, 16
+; SI-NEXT:    s_lshr_b32 s13, s19, 16
+; SI-NEXT:    s_lshr_b32 s14, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB41_3
 ; SI-NEXT:  .LBB41_2: ; %cmp.true
-; SI-NEXT:    s_add_u32 s16, s16, 3
-; SI-NEXT:    s_addc_u32 s17, s17, 0
-; SI-NEXT:    s_add_u32 s18, s18, 3
-; SI-NEXT:    s_addc_u32 s19, s19, 0
 ; SI-NEXT:    s_add_u32 s20, s20, 3
 ; SI-NEXT:    s_addc_u32 s21, s21, 0
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v9, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s21, 16
-; SI-NEXT:    s_lshr_b32 s7, s19, 16
-; SI-NEXT:    s_lshr_b32 s8, s17, 16
+; SI-NEXT:    s_add_u32 s18, s18, 3
+; SI-NEXT:    s_addc_u32 s19, s19, 0
+; SI-NEXT:    s_add_u32 s16, s16, 3
+; SI-NEXT:    s_addc_u32 s17, s17, 0
+; SI-NEXT:    s_lshr_b32 s12, s21, 16
+; SI-NEXT:    s_lshr_b32 s13, s19, 16
+; SI-NEXT:    s_lshr_b32 s14, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[16:17], 16
 ; SI-NEXT:  .LBB41_3: ; %end
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
+; SI-NEXT:    v_mov_b32_e32 v1, s8
 ; SI-NEXT:    v_mov_b32_e32 v2, s17
-; SI-NEXT:    v_mov_b32_e32 v3, s8
+; SI-NEXT:    v_mov_b32_e32 v3, s14
 ; SI-NEXT:    v_mov_b32_e32 v4, s18
+; SI-NEXT:    v_mov_b32_e32 v5, s6
 ; SI-NEXT:    v_mov_b32_e32 v6, s19
-; SI-NEXT:    v_mov_b32_e32 v7, s7
+; SI-NEXT:    v_mov_b32_e32 v7, s13
 ; SI-NEXT:    v_mov_b32_e32 v8, s20
+; SI-NEXT:    v_mov_b32_e32 v9, s4
 ; SI-NEXT:    v_mov_b32_e32 v10, s21
-; SI-NEXT:    v_mov_b32_e32 v11, s6
+; SI-NEXT:    v_mov_b32_e32 v11, s12
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB41_4:
-; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $sgpr14
 ; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr13
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr12
 ; SI-NEXT:    s_branch .LBB41_2
 ;
 ; VI-LABEL: bitcast_v3i64_to_v12i16_scalar:
@@ -6578,45 +6579,45 @@ define inreg <12 x i16> @bitcast_v3f64_to_v12i16_scalar(<3 x double> inreg %a, i
 ; SI-NEXT:    s_cmp_lg_u32 s22, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB49_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v9, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s8, s21, 16
-; SI-NEXT:    s_lshr_b32 s7, s19, 16
-; SI-NEXT:    s_lshr_b32 s6, s17, 16
+; SI-NEXT:    s_lshr_b32 s14, s21, 16
+; SI-NEXT:    s_lshr_b32 s13, s19, 16
+; SI-NEXT:    s_lshr_b32 s12, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB49_4
 ; SI-NEXT:  .LBB49_2: ; %cmp.true
 ; SI-NEXT:    v_add_f64 v[16:17], s[16:17], 1.0
 ; SI-NEXT:    v_add_f64 v[12:13], s[20:21], 1.0
 ; SI-NEXT:    v_add_f64 v[14:15], s[18:19], 1.0
-; SI-NEXT:    v_alignbit_b32 v9, v13, v12, 16
-; SI-NEXT:    v_alignbit_b32 v5, v15, v14, 16
-; SI-NEXT:    v_alignbit_b32 v1, v17, v16, 16
+; SI-NEXT:    v_lshr_b64 v[9:10], v[12:13], 16
+; SI-NEXT:    v_lshr_b64 v[5:6], v[14:15], 16
+; SI-NEXT:    v_lshr_b64 v[1:2], v[16:17], 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v13
 ; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v15
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v17
 ; SI-NEXT:    s_branch .LBB49_5
 ; SI-NEXT:  .LBB49_3:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $vgpr9
 ; SI-NEXT:    ; implicit-def: $sgpr8
+; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr13
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr14
 ; SI-NEXT:    s_branch .LBB49_2
 ; SI-NEXT:  .LBB49_4:
-; SI-NEXT:    v_mov_b32_e32 v17, s17
 ; SI-NEXT:    v_mov_b32_e32 v16, s16
-; SI-NEXT:    v_mov_b32_e32 v15, s19
 ; SI-NEXT:    v_mov_b32_e32 v14, s18
-; SI-NEXT:    v_mov_b32_e32 v13, s21
 ; SI-NEXT:    v_mov_b32_e32 v12, s20
-; SI-NEXT:    v_mov_b32_e32 v3, s6
-; SI-NEXT:    v_mov_b32_e32 v7, s7
-; SI-NEXT:    v_mov_b32_e32 v11, s8
+; SI-NEXT:    v_mov_b32_e32 v17, s17
+; SI-NEXT:    v_mov_b32_e32 v15, s19
+; SI-NEXT:    v_mov_b32_e32 v13, s21
+; SI-NEXT:    v_mov_b32_e32 v3, s12
+; SI-NEXT:    v_mov_b32_e32 v7, s13
+; SI-NEXT:    v_mov_b32_e32 v11, s14
+; SI-NEXT:    v_mov_b32_e32 v1, s8
+; SI-NEXT:    v_mov_b32_e32 v5, s6
+; SI-NEXT:    v_mov_b32_e32 v9, s4
 ; SI-NEXT:  .LBB49_5: ; %end
 ; SI-NEXT:    v_mov_b32_e32 v0, v16
 ; SI-NEXT:    v_mov_b32_e32 v2, v17
@@ -8296,15 +8297,15 @@ define inreg <12 x i16> @bitcast_v12f16_to_v12i16_scalar(<12 x half> inreg %a, i
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, s16
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, s17
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, s18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, s19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, s20
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, s21
+; SI-NEXT:    v_cvt_f16_f32_e32 v15, s21
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, s22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, s23
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, s24
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, s25
+; SI-NEXT:    v_cvt_f16_f32_e32 v12, s25
 ; SI-NEXT:    v_cvt_f16_f32_e32 v10, s26
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, s27
 ; SI-NEXT:    s_cmp_lg_u32 s28, 0
@@ -8317,53 +8318,56 @@ define inreg <12 x i16> @bitcast_v12f16_to_v12i16_scalar(<12 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v14
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v15
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v12
 ; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
 ; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
 ; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
 ; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
-; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
+; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
+; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
 ; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_or_b32_e32 v10, v10, v12
+; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v11
-; SI-NEXT:    v_or_b32_e32 v10, v10, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v7
 ; SI-NEXT:    v_or_b32_e32 v6, v6, v12
 ; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_or_b32_e32 v2, v2, v12
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT:    v_or_b32_e32 v2, v2, v12
+; SI-NEXT:    v_lshr_b64 v[14:15], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[15:16], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[12:13], v[9:10], 16
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SI-NEXT:    v_or_b32_e32 v4, v4, v5
 ; SI-NEXT:    v_or_b32_e32 v8, v8, v9
-; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v9, v10, v9, 16
 ; SI-NEXT:  .LBB59_3: ; %end
+; SI-NEXT:    v_mov_b32_e32 v1, v14
+; SI-NEXT:    v_mov_b32_e32 v5, v15
+; SI-NEXT:    v_mov_b32_e32 v9, v12
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB59_4:
 ; SI-NEXT:    s_branch .LBB59_2
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll
index c830d6b344b6f..7d0897bb2151b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll
@@ -585,57 +585,53 @@ define inreg <14 x i16> @bitcast_v7i32_to_v14i16_scalar(<7 x i32> inreg %a, i32
 ; SI-NEXT:    s_cmp_lg_u32 s23, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB5_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    v_alignbit_b32 v13, s4, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v9, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s21, 16
-; SI-NEXT:    s_lshr_b32 s7, s19, 16
-; SI-NEXT:    s_lshr_b32 s8, s17, 16
+; SI-NEXT:    s_lshr_b32 s23, s17, 16
+; SI-NEXT:    s_lshr_b32 s14, s21, 16
+; SI-NEXT:    s_lshr_b32 s15, s19, 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB5_3
 ; SI-NEXT:  .LBB5_2: ; %cmp.true
-; SI-NEXT:    s_add_i32 s20, s20, 3
-; SI-NEXT:    s_add_i32 s18, s18, 3
-; SI-NEXT:    s_add_i32 s21, s21, 3
-; SI-NEXT:    v_mov_b32_e32 v0, s20
+; SI-NEXT:    s_add_i32 s17, s17, 3
+; SI-NEXT:    s_add_i32 s22, s22, 3
 ; SI-NEXT:    s_add_i32 s16, s16, 3
 ; SI-NEXT:    s_add_i32 s19, s19, 3
-; SI-NEXT:    v_alignbit_b32 v9, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    s_add_i32 s22, s22, 3
-; SI-NEXT:    s_add_i32 s17, s17, 3
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    v_alignbit_b32 v13, s4, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s21, 16
-; SI-NEXT:    s_lshr_b32 s7, s19, 16
-; SI-NEXT:    s_lshr_b32 s8, s17, 16
+; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_add_i32 s21, s21, 3
+; SI-NEXT:    s_add_i32 s20, s20, 3
+; SI-NEXT:    s_lshr_b32 s23, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[16:17], 16
+; SI-NEXT:    s_lshr_b32 s14, s21, 16
+; SI-NEXT:    s_lshr_b32 s15, s19, 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[22:23], 16
 ; SI-NEXT:  .LBB5_3: ; %end
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
+; SI-NEXT:    v_mov_b32_e32 v1, s10
 ; SI-NEXT:    v_mov_b32_e32 v2, s17
-; SI-NEXT:    v_mov_b32_e32 v3, s8
+; SI-NEXT:    v_mov_b32_e32 v3, s23
 ; SI-NEXT:    v_mov_b32_e32 v4, s18
+; SI-NEXT:    v_mov_b32_e32 v5, s8
 ; SI-NEXT:    v_mov_b32_e32 v6, s19
-; SI-NEXT:    v_mov_b32_e32 v7, s7
+; SI-NEXT:    v_mov_b32_e32 v7, s15
 ; SI-NEXT:    v_mov_b32_e32 v8, s20
+; SI-NEXT:    v_mov_b32_e32 v9, s4
 ; SI-NEXT:    v_mov_b32_e32 v10, s21
-; SI-NEXT:    v_mov_b32_e32 v11, s6
+; SI-NEXT:    v_mov_b32_e32 v11, s14
 ; SI-NEXT:    v_mov_b32_e32 v12, s22
+; SI-NEXT:    v_mov_b32_e32 v13, s6
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB5_4:
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr23
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $sgpr15
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr14
 ; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr13
 ; SI-NEXT:    s_branch .LBB5_2
 ;
 ; VI-LABEL: bitcast_v7i32_to_v14i16_scalar:
@@ -2048,54 +2044,61 @@ define inreg <14 x i16> @bitcast_v7f32_to_v14i16_scalar(<7 x float> inreg %a, i3
 ; SI-NEXT:    s_cmp_lg_u32 s23, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB13_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    v_alignbit_b32 v13, s4, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v9, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s8, s21, 16
-; SI-NEXT:    s_lshr_b32 s7, s19, 16
-; SI-NEXT:    s_lshr_b32 s6, s17, 16
+; SI-NEXT:    s_lshr_b32 s23, s21, 16
+; SI-NEXT:    s_lshr_b32 s15, s19, 16
+; SI-NEXT:    s_lshr_b32 s14, s17, 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB13_4
 ; SI-NEXT:  .LBB13_2: ; %cmp.true
 ; SI-NEXT:    v_add_f32_e64 v12, s22, 1.0
-; SI-NEXT:    v_add_f32_e64 v2, s17, 1.0
-; SI-NEXT:    v_add_f32_e64 v0, s16, 1.0
-; SI-NEXT:    v_add_f32_e64 v6, s19, 1.0
-; SI-NEXT:    v_add_f32_e64 v4, s18, 1.0
-; SI-NEXT:    v_add_f32_e64 v10, s21, 1.0
-; SI-NEXT:    v_add_f32_e64 v8, s20, 1.0
-; SI-NEXT:    v_alignbit_b32 v9, v10, v8, 16
-; SI-NEXT:    v_alignbit_b32 v5, v6, v4, 16
-; SI-NEXT:    v_alignbit_b32 v1, v2, v0, 16
-; SI-NEXT:    v_alignbit_b32 v13, s4, v12, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-NEXT:    s_setpc_b64 s[30:31]
+; SI-NEXT:    v_add_f32_e64 v20, s17, 1.0
+; SI-NEXT:    v_add_f32_e64 v19, s16, 1.0
+; SI-NEXT:    v_add_f32_e64 v18, s19, 1.0
+; SI-NEXT:    v_add_f32_e64 v17, s18, 1.0
+; SI-NEXT:    v_add_f32_e64 v16, s21, 1.0
+; SI-NEXT:    v_add_f32_e64 v15, s20, 1.0
+; SI-NEXT:    v_lshr_b64 v[9:10], v[15:16], 16
+; SI-NEXT:    v_lshr_b64 v[5:6], v[17:18], 16
+; SI-NEXT:    v_lshr_b64 v[1:2], v[19:20], 16
+; SI-NEXT:    v_lshr_b64 v[13:14], v[12:13], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v20
+; SI-NEXT:    s_branch .LBB13_5
 ; SI-NEXT:  .LBB13_3:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr14
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $vgpr13
+; SI-NEXT:    ; implicit-def: $sgpr15
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr23
+; SI-NEXT:    ; implicit-def: $sgpr6
 ; SI-NEXT:    s_branch .LBB13_2
 ; SI-NEXT:  .LBB13_4:
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_mov_b32_e32 v2, s17
-; SI-NEXT:    v_mov_b32_e32 v4, s18
-; SI-NEXT:    v_mov_b32_e32 v6, s19
-; SI-NEXT:    v_mov_b32_e32 v8, s20
-; SI-NEXT:    v_mov_b32_e32 v10, s21
+; SI-NEXT:    v_mov_b32_e32 v19, s16
+; SI-NEXT:    v_mov_b32_e32 v20, s17
+; SI-NEXT:    v_mov_b32_e32 v17, s18
+; SI-NEXT:    v_mov_b32_e32 v18, s19
+; SI-NEXT:    v_mov_b32_e32 v15, s20
+; SI-NEXT:    v_mov_b32_e32 v16, s21
 ; SI-NEXT:    v_mov_b32_e32 v12, s22
-; SI-NEXT:    v_mov_b32_e32 v3, s6
-; SI-NEXT:    v_mov_b32_e32 v7, s7
-; SI-NEXT:    v_mov_b32_e32 v11, s8
+; SI-NEXT:    v_mov_b32_e32 v3, s14
+; SI-NEXT:    v_mov_b32_e32 v7, s15
+; SI-NEXT:    v_mov_b32_e32 v11, s23
+; SI-NEXT:    v_mov_b32_e32 v13, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s10
+; SI-NEXT:    v_mov_b32_e32 v5, s8
+; SI-NEXT:    v_mov_b32_e32 v9, s4
+; SI-NEXT:  .LBB13_5: ; %end
+; SI-NEXT:    v_mov_b32_e32 v0, v19
+; SI-NEXT:    v_mov_b32_e32 v2, v20
+; SI-NEXT:    v_mov_b32_e32 v4, v17
+; SI-NEXT:    v_mov_b32_e32 v6, v18
+; SI-NEXT:    v_mov_b32_e32 v8, v15
+; SI-NEXT:    v_mov_b32_e32 v10, v16
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bitcast_v7f32_to_v14i16_scalar:
@@ -3965,22 +3968,21 @@ define inreg <14 x i16> @bitcast_v14f16_to_v14i16_scalar(<14 x half> inreg %a, i
 ; SI-LABEL: bitcast_v14f16_to_v14i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v14, v0
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, s16
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, s17
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, s18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, s19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, s20
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, s21
+; SI-NEXT:    v_cvt_f16_f32_e32 v15, s21
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, s22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, s23
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, s24
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, s25
+; SI-NEXT:    v_cvt_f16_f32_e32 v16, s25
 ; SI-NEXT:    v_cvt_f16_f32_e32 v10, s26
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, s27
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, s28
 ; SI-NEXT:    v_cvt_f16_f32_e32 v13, s29
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v14
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_cbranch_scc0 .LBB23_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
@@ -3993,17 +3995,14 @@ define inreg <14 x i16> @bitcast_v14f16_to_v14i16_scalar(<14 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v14
 ; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v15
 ; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
 ; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
 ; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v16
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
 ; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
@@ -4012,40 +4011,46 @@ define inreg <14 x i16> @bitcast_v14f16_to_v14i16_scalar(<14 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
-; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v13
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
 ; SI-NEXT:    v_or_b32_e32 v12, v12, v14
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
 ; SI-NEXT:    v_or_b32_e32 v10, v10, v14
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v7
+; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
+; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
+; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
 ; SI-NEXT:    v_or_b32_e32 v6, v6, v14
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; SI-NEXT:    v_or_b32_e32 v2, v2, v14
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_lshr_b64 v[14:15], v[1:2], 16
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT:    v_or_b32_e32 v2, v2, v14
+; SI-NEXT:    v_lshr_b64 v[15:16], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[16:17], v[9:10], 16
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SI-NEXT:    v_or_b32_e32 v4, v4, v5
 ; SI-NEXT:    v_or_b32_e32 v8, v8, v9
-; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v9, v10, v9, 16
 ; SI-NEXT:  .LBB23_3: ; %end
+; SI-NEXT:    v_mov_b32_e32 v1, v14
+; SI-NEXT:    v_mov_b32_e32 v5, v15
+; SI-NEXT:    v_mov_b32_e32 v9, v16
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB23_4:
 ; SI-NEXT:    s_branch .LBB23_2
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
index 52e125d0d658f..cb4b3bd4382a4 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
@@ -1514,63 +1514,59 @@ define inreg <16 x i16> @bitcast_v8i32_to_v16i16_scalar(<8 x i32> inreg %a, i32
 ; SI-NEXT:    s_cmp_lg_u32 s24, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB13_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    v_alignbit_b32 v13, s23, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v9, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s23, 16
-; SI-NEXT:    s_lshr_b32 s7, s21, 16
-; SI-NEXT:    s_lshr_b32 s8, s19, 16
-; SI-NEXT:    s_lshr_b32 s9, s17, 16
+; SI-NEXT:    s_lshr_b32 s14, s23, 16
+; SI-NEXT:    s_lshr_b32 s15, s21, 16
+; SI-NEXT:    s_lshr_b32 s24, s19, 16
+; SI-NEXT:    s_lshr_b32 s25, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB13_3
 ; SI-NEXT:  .LBB13_2: ; %cmp.true
-; SI-NEXT:    s_add_i32 s22, s22, 3
-; SI-NEXT:    s_add_i32 s20, s20, 3
-; SI-NEXT:    s_add_i32 s23, s23, 3
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    s_add_i32 s18, s18, 3
-; SI-NEXT:    s_add_i32 s21, s21, 3
-; SI-NEXT:    v_alignbit_b32 v13, s23, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s20
+; SI-NEXT:    s_add_i32 s17, s17, 3
 ; SI-NEXT:    s_add_i32 s16, s16, 3
 ; SI-NEXT:    s_add_i32 s19, s19, 3
-; SI-NEXT:    v_alignbit_b32 v9, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    s_add_i32 s17, s17, 3
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s23, 16
-; SI-NEXT:    s_lshr_b32 s7, s21, 16
-; SI-NEXT:    s_lshr_b32 s8, s19, 16
-; SI-NEXT:    s_lshr_b32 s9, s17, 16
+; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_add_i32 s21, s21, 3
+; SI-NEXT:    s_add_i32 s20, s20, 3
+; SI-NEXT:    s_add_i32 s23, s23, 3
+; SI-NEXT:    s_add_i32 s22, s22, 3
+; SI-NEXT:    s_lshr_b64 s[4:5], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[16:17], 16
+; SI-NEXT:    s_lshr_b32 s14, s23, 16
+; SI-NEXT:    s_lshr_b32 s15, s21, 16
+; SI-NEXT:    s_lshr_b32 s24, s19, 16
+; SI-NEXT:    s_lshr_b32 s25, s17, 16
 ; SI-NEXT:  .LBB13_3: ; %end
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
+; SI-NEXT:    v_mov_b32_e32 v1, s10
 ; SI-NEXT:    v_mov_b32_e32 v2, s17
-; SI-NEXT:    v_mov_b32_e32 v3, s9
+; SI-NEXT:    v_mov_b32_e32 v3, s25
 ; SI-NEXT:    v_mov_b32_e32 v4, s18
+; SI-NEXT:    v_mov_b32_e32 v5, s8
 ; SI-NEXT:    v_mov_b32_e32 v6, s19
-; SI-NEXT:    v_mov_b32_e32 v7, s8
+; SI-NEXT:    v_mov_b32_e32 v7, s24
 ; SI-NEXT:    v_mov_b32_e32 v8, s20
+; SI-NEXT:    v_mov_b32_e32 v9, s6
 ; SI-NEXT:    v_mov_b32_e32 v10, s21
-; SI-NEXT:    v_mov_b32_e32 v11, s7
+; SI-NEXT:    v_mov_b32_e32 v11, s15
 ; SI-NEXT:    v_mov_b32_e32 v12, s22
+; SI-NEXT:    v_mov_b32_e32 v13, s4
 ; SI-NEXT:    v_mov_b32_e32 v14, s23
-; SI-NEXT:    v_mov_b32_e32 v15, s6
+; SI-NEXT:    v_mov_b32_e32 v15, s14
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB13_4:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr9
-; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr25
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $vgpr13
+; SI-NEXT:    ; implicit-def: $sgpr24
 ; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr15
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr14
 ; SI-NEXT:    s_branch .LBB13_2
 ;
 ; VI-LABEL: bitcast_v8i32_to_v16i16_scalar:
@@ -5255,119 +5251,123 @@ define inreg <32 x i8> @bitcast_v8i32_to_v32i8_scalar(<8 x i32> inreg %a, i32 in
 ; SI-NEXT:    s_cmp_lg_u32 s24, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB25_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    v_alignbit_b32 v27, s23, v0, 24
-; SI-NEXT:    v_alignbit_b32 v26, s23, v0, 16
-; SI-NEXT:    v_alignbit_b32 v25, s23, v0, 8
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v19, s21, v0, 24
-; SI-NEXT:    v_alignbit_b32 v18, s21, v0, 16
-; SI-NEXT:    v_alignbit_b32 v17, s21, v0, 8
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v11, s19, v0, 24
-; SI-NEXT:    v_alignbit_b32 v10, s19, v0, 16
-; SI-NEXT:    v_alignbit_b32 v9, s19, v0, 8
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v3, s17, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, s17, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 8
-; SI-NEXT:    s_lshr_b32 s6, s23, 24
-; SI-NEXT:    s_lshr_b32 s7, s23, 16
-; SI-NEXT:    s_lshr_b32 s9, s23, 8
-; SI-NEXT:    s_lshr_b32 s8, s21, 24
-; SI-NEXT:    s_lshr_b32 s10, s21, 16
-; SI-NEXT:    s_lshr_b32 s12, s21, 8
-; SI-NEXT:    s_lshr_b32 s11, s19, 24
-; SI-NEXT:    s_lshr_b32 s13, s19, 16
-; SI-NEXT:    s_lshr_b32 s14, s19, 8
-; SI-NEXT:    s_lshr_b32 s15, s17, 24
-; SI-NEXT:    s_lshr_b32 s24, s17, 16
-; SI-NEXT:    s_lshr_b32 s25, s17, 8
+; SI-NEXT:    s_lshr_b32 s56, s23, 24
+; SI-NEXT:    s_lshr_b32 s57, s23, 16
+; SI-NEXT:    s_lshr_b32 s58, s23, 8
+; SI-NEXT:    s_lshr_b32 s59, s21, 24
+; SI-NEXT:    s_lshr_b32 s60, s21, 16
+; SI-NEXT:    s_lshr_b32 s61, s21, 8
+; SI-NEXT:    s_lshr_b32 s62, s19, 24
+; SI-NEXT:    s_lshr_b32 s63, s19, 16
+; SI-NEXT:    s_lshr_b32 s72, s19, 8
+; SI-NEXT:    s_lshr_b32 s73, s17, 24
+; SI-NEXT:    s_lshr_b32 s74, s17, 16
+; SI-NEXT:    s_lshr_b32 s75, s17, 8
+; SI-NEXT:    s_lshr_b64 s[4:5], s[22:23], 24
+; SI-NEXT:    s_lshr_b64 s[6:7], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[22:23], 8
+; SI-NEXT:    s_lshr_b64 s[10:11], s[20:21], 24
+; SI-NEXT:    s_lshr_b64 s[12:13], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[20:21], 8
+; SI-NEXT:    s_lshr_b64 s[24:25], s[18:19], 24
+; SI-NEXT:    s_lshr_b64 s[26:27], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[28:29], s[18:19], 8
+; SI-NEXT:    s_lshr_b64 s[40:41], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[42:43], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[16:17], 8
 ; SI-NEXT:    s_cbranch_execnz .LBB25_3
 ; SI-NEXT:  .LBB25_2: ; %cmp.true
-; SI-NEXT:    s_add_i32 s22, s22, 3
-; SI-NEXT:    s_add_i32 s20, s20, 3
-; SI-NEXT:    s_add_i32 s23, s23, 3
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    s_add_i32 s18, s18, 3
-; SI-NEXT:    s_add_i32 s21, s21, 3
-; SI-NEXT:    v_alignbit_b32 v27, s23, v0, 24
-; SI-NEXT:    v_alignbit_b32 v26, s23, v0, 16
-; SI-NEXT:    v_alignbit_b32 v25, s23, v0, 8
-; SI-NEXT:    v_mov_b32_e32 v0, s20
+; SI-NEXT:    s_add_i32 s17, s17, 3
 ; SI-NEXT:    s_add_i32 s16, s16, 3
 ; SI-NEXT:    s_add_i32 s19, s19, 3
-; SI-NEXT:    v_alignbit_b32 v19, s21, v0, 24
-; SI-NEXT:    v_alignbit_b32 v18, s21, v0, 16
-; SI-NEXT:    v_alignbit_b32 v17, s21, v0, 8
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    s_add_i32 s17, s17, 3
-; SI-NEXT:    v_alignbit_b32 v11, s19, v0, 24
-; SI-NEXT:    v_alignbit_b32 v10, s19, v0, 16
-; SI-NEXT:    v_alignbit_b32 v9, s19, v0, 8
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v3, s17, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, s17, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 8
-; SI-NEXT:    s_lshr_b32 s6, s23, 24
-; SI-NEXT:    s_lshr_b32 s7, s23, 16
-; SI-NEXT:    s_lshr_b32 s9, s23, 8
-; SI-NEXT:    s_lshr_b32 s8, s21, 24
-; SI-NEXT:    s_lshr_b32 s10, s21, 16
-; SI-NEXT:    s_lshr_b32 s12, s21, 8
-; SI-NEXT:    s_lshr_b32 s11, s19, 24
-; SI-NEXT:    s_lshr_b32 s13, s19, 16
-; SI-NEXT:    s_lshr_b32 s14, s19, 8
-; SI-NEXT:    s_lshr_b32 s15, s17, 24
-; SI-NEXT:    s_lshr_b32 s24, s17, 16
-; SI-NEXT:    s_lshr_b32 s25, s17, 8
+; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_add_i32 s21, s21, 3
+; SI-NEXT:    s_add_i32 s20, s20, 3
+; SI-NEXT:    s_add_i32 s23, s23, 3
+; SI-NEXT:    s_add_i32 s22, s22, 3
+; SI-NEXT:    s_lshr_b64 s[4:5], s[22:23], 24
+; SI-NEXT:    s_lshr_b64 s[6:7], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[22:23], 8
+; SI-NEXT:    s_lshr_b64 s[10:11], s[20:21], 24
+; SI-NEXT:    s_lshr_b64 s[12:13], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[20:21], 8
+; SI-NEXT:    s_lshr_b64 s[24:25], s[18:19], 24
+; SI-NEXT:    s_lshr_b64 s[26:27], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[28:29], s[18:19], 8
+; SI-NEXT:    s_lshr_b64 s[40:41], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[42:43], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[16:17], 8
+; SI-NEXT:    s_lshr_b32 s56, s23, 24
+; SI-NEXT:    s_lshr_b32 s57, s23, 16
+; SI-NEXT:    s_lshr_b32 s58, s23, 8
+; SI-NEXT:    s_lshr_b32 s59, s21, 24
+; SI-NEXT:    s_lshr_b32 s60, s21, 16
+; SI-NEXT:    s_lshr_b32 s61, s21, 8
+; SI-NEXT:    s_lshr_b32 s62, s19, 24
+; SI-NEXT:    s_lshr_b32 s63, s19, 16
+; SI-NEXT:    s_lshr_b32 s72, s19, 8
+; SI-NEXT:    s_lshr_b32 s73, s17, 24
+; SI-NEXT:    s_lshr_b32 s74, s17, 16
+; SI-NEXT:    s_lshr_b32 s75, s17, 8
 ; SI-NEXT:  .LBB25_3: ; %end
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
+; SI-NEXT:    v_mov_b32_e32 v1, s44
+; SI-NEXT:    v_mov_b32_e32 v2, s42
+; SI-NEXT:    v_mov_b32_e32 v3, s40
 ; SI-NEXT:    v_mov_b32_e32 v4, s17
-; SI-NEXT:    v_mov_b32_e32 v5, s25
-; SI-NEXT:    v_mov_b32_e32 v6, s24
-; SI-NEXT:    v_mov_b32_e32 v7, s15
+; SI-NEXT:    v_mov_b32_e32 v5, s75
+; SI-NEXT:    v_mov_b32_e32 v6, s74
+; SI-NEXT:    v_mov_b32_e32 v7, s73
 ; SI-NEXT:    v_mov_b32_e32 v8, s18
+; SI-NEXT:    v_mov_b32_e32 v9, s28
+; SI-NEXT:    v_mov_b32_e32 v10, s26
+; SI-NEXT:    v_mov_b32_e32 v11, s24
 ; SI-NEXT:    v_mov_b32_e32 v12, s19
-; SI-NEXT:    v_mov_b32_e32 v13, s14
-; SI-NEXT:    v_mov_b32_e32 v14, s13
-; SI-NEXT:    v_mov_b32_e32 v15, s11
+; SI-NEXT:    v_mov_b32_e32 v13, s72
+; SI-NEXT:    v_mov_b32_e32 v14, s63
+; SI-NEXT:    v_mov_b32_e32 v15, s62
 ; SI-NEXT:    v_mov_b32_e32 v16, s20
+; SI-NEXT:    v_mov_b32_e32 v17, s14
+; SI-NEXT:    v_mov_b32_e32 v18, s12
+; SI-NEXT:    v_mov_b32_e32 v19, s10
 ; SI-NEXT:    v_mov_b32_e32 v20, s21
-; SI-NEXT:    v_mov_b32_e32 v21, s12
-; SI-NEXT:    v_mov_b32_e32 v22, s10
-; SI-NEXT:    v_mov_b32_e32 v23, s8
+; SI-NEXT:    v_mov_b32_e32 v21, s61
+; SI-NEXT:    v_mov_b32_e32 v22, s60
+; SI-NEXT:    v_mov_b32_e32 v23, s59
 ; SI-NEXT:    v_mov_b32_e32 v24, s22
+; SI-NEXT:    v_mov_b32_e32 v25, s8
+; SI-NEXT:    v_mov_b32_e32 v26, s6
+; SI-NEXT:    v_mov_b32_e32 v27, s4
 ; SI-NEXT:    v_mov_b32_e32 v28, s23
-; SI-NEXT:    v_mov_b32_e32 v29, s9
-; SI-NEXT:    v_mov_b32_e32 v30, s7
-; SI-NEXT:    v_mov_b32_e32 v31, s6
+; SI-NEXT:    v_mov_b32_e32 v29, s58
+; SI-NEXT:    v_mov_b32_e32 v30, s57
+; SI-NEXT:    v_mov_b32_e32 v31, s56
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB25_4:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $sgpr25
+; SI-NEXT:    ; implicit-def: $sgpr44
+; SI-NEXT:    ; implicit-def: $sgpr42
+; SI-NEXT:    ; implicit-def: $sgpr40
+; SI-NEXT:    ; implicit-def: $sgpr75
+; SI-NEXT:    ; implicit-def: $sgpr74
+; SI-NEXT:    ; implicit-def: $sgpr73
+; SI-NEXT:    ; implicit-def: $sgpr28
+; SI-NEXT:    ; implicit-def: $sgpr26
 ; SI-NEXT:    ; implicit-def: $sgpr24
-; SI-NEXT:    ; implicit-def: $sgpr15
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $vgpr11
+; SI-NEXT:    ; implicit-def: $sgpr72
+; SI-NEXT:    ; implicit-def: $sgpr63
+; SI-NEXT:    ; implicit-def: $sgpr62
 ; SI-NEXT:    ; implicit-def: $sgpr14
-; SI-NEXT:    ; implicit-def: $sgpr13
-; SI-NEXT:    ; implicit-def: $sgpr11
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr18
-; SI-NEXT:    ; implicit-def: $vgpr19
 ; SI-NEXT:    ; implicit-def: $sgpr12
 ; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr61
+; SI-NEXT:    ; implicit-def: $sgpr60
+; SI-NEXT:    ; implicit-def: $sgpr59
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr26
-; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $sgpr9
-; SI-NEXT:    ; implicit-def: $sgpr7
 ; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr58
+; SI-NEXT:    ; implicit-def: $sgpr57
+; SI-NEXT:    ; implicit-def: $sgpr56
 ; SI-NEXT:    s_branch .LBB25_2
 ;
 ; VI-LABEL: bitcast_v8i32_to_v32i8_scalar:
@@ -8503,60 +8503,69 @@ define inreg <16 x i16> @bitcast_v8f32_to_v16i16_scalar(<8 x float> inreg %a, i3
 ; SI-NEXT:    s_cmp_lg_u32 s24, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB37_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    v_alignbit_b32 v13, s23, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v9, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s9, s23, 16
-; SI-NEXT:    s_lshr_b32 s8, s21, 16
-; SI-NEXT:    s_lshr_b32 s7, s19, 16
-; SI-NEXT:    s_lshr_b32 s6, s17, 16
+; SI-NEXT:    s_lshr_b32 s25, s23, 16
+; SI-NEXT:    s_lshr_b32 s24, s21, 16
+; SI-NEXT:    s_lshr_b32 s15, s19, 16
+; SI-NEXT:    s_lshr_b32 s14, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB37_4
 ; SI-NEXT:  .LBB37_2: ; %cmp.true
-; SI-NEXT:    v_add_f32_e64 v2, s17, 1.0
-; SI-NEXT:    v_add_f32_e64 v0, s16, 1.0
-; SI-NEXT:    v_add_f32_e64 v6, s19, 1.0
-; SI-NEXT:    v_add_f32_e64 v4, s18, 1.0
-; SI-NEXT:    v_add_f32_e64 v10, s21, 1.0
-; SI-NEXT:    v_add_f32_e64 v8, s20, 1.0
-; SI-NEXT:    v_add_f32_e64 v14, s23, 1.0
-; SI-NEXT:    v_add_f32_e64 v12, s22, 1.0
-; SI-NEXT:    v_alignbit_b32 v13, v14, v12, 16
-; SI-NEXT:    v_alignbit_b32 v9, v10, v8, 16
-; SI-NEXT:    v_alignbit_b32 v5, v6, v4, 16
-; SI-NEXT:    v_alignbit_b32 v1, v2, v0, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-NEXT:    s_setpc_b64 s[30:31]
+; SI-NEXT:    v_add_f32_e64 v23, s17, 1.0
+; SI-NEXT:    v_add_f32_e64 v22, s16, 1.0
+; SI-NEXT:    v_add_f32_e64 v21, s19, 1.0
+; SI-NEXT:    v_add_f32_e64 v20, s18, 1.0
+; SI-NEXT:    v_add_f32_e64 v19, s21, 1.0
+; SI-NEXT:    v_add_f32_e64 v18, s20, 1.0
+; SI-NEXT:    v_add_f32_e64 v17, s23, 1.0
+; SI-NEXT:    v_add_f32_e64 v16, s22, 1.0
+; SI-NEXT:    v_lshr_b64 v[13:14], v[16:17], 16
+; SI-NEXT:    v_lshr_b64 v[9:10], v[18:19], 16
+; SI-NEXT:    v_lshr_b64 v[5:6], v[20:21], 16
+; SI-NEXT:    v_lshr_b64 v[1:2], v[22:23], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v17
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v19
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v21
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v23
+; SI-NEXT:    s_branch .LBB37_5
 ; SI-NEXT:  .LBB37_3:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr14
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $sgpr9
+; SI-NEXT:    ; implicit-def: $sgpr15
+; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr24
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr25
 ; SI-NEXT:    s_branch .LBB37_2
 ; SI-NEXT:  .LBB37_4:
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_mov_b32_e32 v2, s17
-; SI-NEXT:    v_mov_b32_e32 v4, s18
-; SI-NEXT:    v_mov_b32_e32 v6, s19
-; SI-NEXT:    v_mov_b32_e32 v8, s20
-; SI-NEXT:    v_mov_b32_e32 v10, s21
-; SI-NEXT:    v_mov_b32_e32 v12, s22
-; SI-NEXT:    v_mov_b32_e32 v14, s23
-; SI-NEXT:    v_mov_b32_e32 v3, s6
-; SI-NEXT:    v_mov_b32_e32 v7, s7
-; SI-NEXT:    v_mov_b32_e32 v11, s8
-; SI-NEXT:    v_mov_b32_e32 v15, s9
+; SI-NEXT:    v_mov_b32_e32 v22, s16
+; SI-NEXT:    v_mov_b32_e32 v23, s17
+; SI-NEXT:    v_mov_b32_e32 v20, s18
+; SI-NEXT:    v_mov_b32_e32 v21, s19
+; SI-NEXT:    v_mov_b32_e32 v18, s20
+; SI-NEXT:    v_mov_b32_e32 v19, s21
+; SI-NEXT:    v_mov_b32_e32 v16, s22
+; SI-NEXT:    v_mov_b32_e32 v17, s23
+; SI-NEXT:    v_mov_b32_e32 v3, s14
+; SI-NEXT:    v_mov_b32_e32 v7, s15
+; SI-NEXT:    v_mov_b32_e32 v11, s24
+; SI-NEXT:    v_mov_b32_e32 v15, s25
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:    v_mov_b32_e32 v9, s6
+; SI-NEXT:    v_mov_b32_e32 v5, s8
+; SI-NEXT:    v_mov_b32_e32 v1, s10
+; SI-NEXT:  .LBB37_5: ; %end
+; SI-NEXT:    v_mov_b32_e32 v0, v22
+; SI-NEXT:    v_mov_b32_e32 v2, v23
+; SI-NEXT:    v_mov_b32_e32 v4, v20
+; SI-NEXT:    v_mov_b32_e32 v6, v21
+; SI-NEXT:    v_mov_b32_e32 v8, v18
+; SI-NEXT:    v_mov_b32_e32 v10, v19
+; SI-NEXT:    v_mov_b32_e32 v12, v16
+; SI-NEXT:    v_mov_b32_e32 v14, v17
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bitcast_v8f32_to_v16i16_scalar:
@@ -12246,116 +12255,137 @@ define inreg <32 x i8> @bitcast_v8f32_to_v32i8_scalar(<8 x float> inreg %a, i32
 ; SI-NEXT:    s_cmp_lg_u32 s24, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB49_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    v_alignbit_b32 v27, s23, v0, 24
-; SI-NEXT:    v_alignbit_b32 v26, s23, v0, 16
-; SI-NEXT:    v_alignbit_b32 v25, s23, v0, 8
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v19, s21, v0, 24
-; SI-NEXT:    v_alignbit_b32 v18, s21, v0, 16
-; SI-NEXT:    v_alignbit_b32 v17, s21, v0, 8
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v11, s19, v0, 24
-; SI-NEXT:    v_alignbit_b32 v10, s19, v0, 16
-; SI-NEXT:    v_alignbit_b32 v9, s19, v0, 8
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v3, s17, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, s17, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 8
-; SI-NEXT:    s_lshr_b32 s15, s23, 24
-; SI-NEXT:    s_lshr_b32 s24, s23, 16
-; SI-NEXT:    s_lshr_b32 s25, s23, 8
-; SI-NEXT:    s_lshr_b32 s12, s21, 24
-; SI-NEXT:    s_lshr_b32 s13, s21, 16
-; SI-NEXT:    s_lshr_b32 s14, s21, 8
-; SI-NEXT:    s_lshr_b32 s9, s19, 24
-; SI-NEXT:    s_lshr_b32 s10, s19, 16
-; SI-NEXT:    s_lshr_b32 s11, s19, 8
-; SI-NEXT:    s_lshr_b32 s6, s17, 24
-; SI-NEXT:    s_lshr_b32 s7, s17, 16
-; SI-NEXT:    s_lshr_b32 s8, s17, 8
+; SI-NEXT:    s_lshr_b32 s72, s23, 24
+; SI-NEXT:    s_lshr_b32 s74, s23, 16
+; SI-NEXT:    s_lshr_b32 s75, s23, 8
+; SI-NEXT:    s_lshr_b32 s61, s21, 24
+; SI-NEXT:    s_lshr_b32 s63, s21, 16
+; SI-NEXT:    s_lshr_b32 s73, s21, 8
+; SI-NEXT:    s_lshr_b32 s58, s19, 24
+; SI-NEXT:    s_lshr_b32 s60, s19, 16
+; SI-NEXT:    s_lshr_b32 s62, s19, 8
+; SI-NEXT:    s_lshr_b32 s56, s17, 24
+; SI-NEXT:    s_lshr_b32 s57, s17, 16
+; SI-NEXT:    s_lshr_b32 s59, s17, 8
+; SI-NEXT:    s_lshr_b64 s[40:41], s[22:23], 24
+; SI-NEXT:    s_lshr_b64 s[42:43], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[22:23], 8
+; SI-NEXT:    s_lshr_b64 s[24:25], s[20:21], 24
+; SI-NEXT:    s_lshr_b64 s[26:27], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[28:29], s[20:21], 8
+; SI-NEXT:    s_lshr_b64 s[10:11], s[18:19], 24
+; SI-NEXT:    s_lshr_b64 s[12:13], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[18:19], 8
+; SI-NEXT:    s_lshr_b64 s[4:5], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[6:7], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[16:17], 8
 ; SI-NEXT:    s_cbranch_execnz .LBB49_4
 ; SI-NEXT:  .LBB49_2: ; %cmp.true
-; SI-NEXT:    v_add_f32_e64 v4, s17, 1.0
-; SI-NEXT:    v_add_f32_e64 v0, s16, 1.0
-; SI-NEXT:    v_add_f32_e64 v12, s19, 1.0
-; SI-NEXT:    v_add_f32_e64 v8, s18, 1.0
-; SI-NEXT:    v_add_f32_e64 v20, s21, 1.0
-; SI-NEXT:    v_add_f32_e64 v16, s20, 1.0
-; SI-NEXT:    v_add_f32_e64 v28, s23, 1.0
-; SI-NEXT:    v_add_f32_e64 v24, s22, 1.0
-; SI-NEXT:    v_alignbit_b32 v27, v28, v24, 24
-; SI-NEXT:    v_alignbit_b32 v26, v28, v24, 16
-; SI-NEXT:    v_alignbit_b32 v25, v28, v24, 8
-; SI-NEXT:    v_alignbit_b32 v19, v20, v16, 24
-; SI-NEXT:    v_alignbit_b32 v18, v20, v16, 16
-; SI-NEXT:    v_alignbit_b32 v17, v20, v16, 8
-; SI-NEXT:    v_alignbit_b32 v11, v12, v8, 24
-; SI-NEXT:    v_alignbit_b32 v10, v12, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, v12, v8, 8
-; SI-NEXT:    v_alignbit_b32 v3, v4, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, v4, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, v4, v0, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 24, v28
-; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v28
-; SI-NEXT:    v_lshrrev_b32_e32 v29, 8, v28
-; SI-NEXT:    v_lshrrev_b32_e32 v23, 24, v20
-; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v20
-; SI-NEXT:    v_lshrrev_b32_e32 v21, 8, v20
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 24, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v13, 8, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; SI-NEXT:    s_setpc_b64 s[30:31]
+; SI-NEXT:    v_add_f32_e64 v39, s17, 1.0
+; SI-NEXT:    v_add_f32_e64 v38, s16, 1.0
+; SI-NEXT:    v_add_f32_e64 v49, s23, 1.0
+; SI-NEXT:    v_add_f32_e64 v48, s22, 1.0
+; SI-NEXT:    v_add_f32_e64 v35, s19, 1.0
+; SI-NEXT:    v_add_f32_e64 v34, s18, 1.0
+; SI-NEXT:    v_add_f32_e64 v37, s21, 1.0
+; SI-NEXT:    v_add_f32_e64 v36, s20, 1.0
+; SI-NEXT:    v_lshr_b64 v[27:28], v[48:49], 24
+; SI-NEXT:    v_lshr_b64 v[24:25], v[48:49], 16
+; SI-NEXT:    v_lshr_b64 v[0:1], v[38:39], 16
+; SI-NEXT:    v_lshr_b64 v[25:26], v[48:49], 8
+; SI-NEXT:    v_lshr_b64 v[19:20], v[36:37], 24
+; SI-NEXT:    v_lshr_b64 v[32:33], v[36:37], 16
+; SI-NEXT:    v_lshr_b64 v[17:18], v[36:37], 8
+; SI-NEXT:    v_lshr_b64 v[11:12], v[34:35], 24
+; SI-NEXT:    v_lshr_b64 v[28:29], v[34:35], 16
+; SI-NEXT:    v_lshr_b64 v[9:10], v[34:35], 8
+; SI-NEXT:    v_lshr_b64 v[3:4], v[38:39], 24
+; SI-NEXT:    v_lshr_b64 v[1:2], v[38:39], 8
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 24, v49
+; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v49
+; SI-NEXT:    v_lshrrev_b32_e32 v29, 8, v49
+; SI-NEXT:    v_lshrrev_b32_e32 v23, 24, v37
+; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v37
+; SI-NEXT:    v_lshrrev_b32_e32 v21, 8, v37
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 24, v35
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v35
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 8, v35
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 24, v39
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v39
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v39
+; SI-NEXT:    s_branch .LBB49_5
 ; SI-NEXT:  .LBB49_3:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $sgpr7
 ; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; implicit-def: $sgpr11
-; SI-NEXT:    ; implicit-def: $sgpr10
-; SI-NEXT:    ; implicit-def: $sgpr9
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr18
-; SI-NEXT:    ; implicit-def: $vgpr19
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr59
+; SI-NEXT:    ; implicit-def: $sgpr57
+; SI-NEXT:    ; implicit-def: $sgpr56
 ; SI-NEXT:    ; implicit-def: $sgpr14
-; SI-NEXT:    ; implicit-def: $sgpr13
 ; SI-NEXT:    ; implicit-def: $sgpr12
-; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr26
-; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $sgpr25
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr62
+; SI-NEXT:    ; implicit-def: $sgpr60
+; SI-NEXT:    ; implicit-def: $sgpr58
+; SI-NEXT:    ; implicit-def: $sgpr28
+; SI-NEXT:    ; implicit-def: $sgpr26
 ; SI-NEXT:    ; implicit-def: $sgpr24
-; SI-NEXT:    ; implicit-def: $sgpr15
+; SI-NEXT:    ; implicit-def: $sgpr73
+; SI-NEXT:    ; implicit-def: $sgpr63
+; SI-NEXT:    ; implicit-def: $sgpr61
+; SI-NEXT:    ; implicit-def: $sgpr44
+; SI-NEXT:    ; implicit-def: $sgpr42
+; SI-NEXT:    ; implicit-def: $sgpr40
+; SI-NEXT:    ; implicit-def: $sgpr75
+; SI-NEXT:    ; implicit-def: $sgpr74
+; SI-NEXT:    ; implicit-def: $sgpr72
 ; SI-NEXT:    s_branch .LBB49_2
 ; SI-NEXT:  .LBB49_4:
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_mov_b32_e32 v4, s17
-; SI-NEXT:    v_mov_b32_e32 v8, s18
-; SI-NEXT:    v_mov_b32_e32 v12, s19
-; SI-NEXT:    v_mov_b32_e32 v16, s20
-; SI-NEXT:    v_mov_b32_e32 v20, s21
-; SI-NEXT:    v_mov_b32_e32 v24, s22
-; SI-NEXT:    v_mov_b32_e32 v28, s23
-; SI-NEXT:    v_mov_b32_e32 v5, s8
-; SI-NEXT:    v_mov_b32_e32 v6, s7
-; SI-NEXT:    v_mov_b32_e32 v7, s6
-; SI-NEXT:    v_mov_b32_e32 v13, s11
-; SI-NEXT:    v_mov_b32_e32 v14, s10
-; SI-NEXT:    v_mov_b32_e32 v15, s9
-; SI-NEXT:    v_mov_b32_e32 v21, s14
-; SI-NEXT:    v_mov_b32_e32 v22, s13
-; SI-NEXT:    v_mov_b32_e32 v23, s12
-; SI-NEXT:    v_mov_b32_e32 v29, s25
-; SI-NEXT:    v_mov_b32_e32 v30, s24
-; SI-NEXT:    v_mov_b32_e32 v31, s15
+; SI-NEXT:    v_mov_b32_e32 v38, s16
+; SI-NEXT:    v_mov_b32_e32 v39, s17
+; SI-NEXT:    v_mov_b32_e32 v34, s18
+; SI-NEXT:    v_mov_b32_e32 v35, s19
+; SI-NEXT:    v_mov_b32_e32 v36, s20
+; SI-NEXT:    v_mov_b32_e32 v37, s21
+; SI-NEXT:    v_mov_b32_e32 v48, s22
+; SI-NEXT:    v_mov_b32_e32 v49, s23
+; SI-NEXT:    v_mov_b32_e32 v5, s59
+; SI-NEXT:    v_mov_b32_e32 v6, s57
+; SI-NEXT:    v_mov_b32_e32 v7, s56
+; SI-NEXT:    v_mov_b32_e32 v13, s62
+; SI-NEXT:    v_mov_b32_e32 v14, s60
+; SI-NEXT:    v_mov_b32_e32 v15, s58
+; SI-NEXT:    v_mov_b32_e32 v21, s73
+; SI-NEXT:    v_mov_b32_e32 v22, s63
+; SI-NEXT:    v_mov_b32_e32 v23, s61
+; SI-NEXT:    v_mov_b32_e32 v29, s75
+; SI-NEXT:    v_mov_b32_e32 v30, s74
+; SI-NEXT:    v_mov_b32_e32 v31, s72
+; SI-NEXT:    v_mov_b32_e32 v27, s40
+; SI-NEXT:    v_mov_b32_e32 v24, s42
+; SI-NEXT:    v_mov_b32_e32 v25, s44
+; SI-NEXT:    v_mov_b32_e32 v19, s24
+; SI-NEXT:    v_mov_b32_e32 v32, s26
+; SI-NEXT:    v_mov_b32_e32 v17, s28
+; SI-NEXT:    v_mov_b32_e32 v11, s10
+; SI-NEXT:    v_mov_b32_e32 v28, s12
+; SI-NEXT:    v_mov_b32_e32 v9, s14
+; SI-NEXT:    v_mov_b32_e32 v3, s4
+; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s8
+; SI-NEXT:  .LBB49_5: ; %end
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    v_mov_b32_e32 v0, v38
+; SI-NEXT:    v_mov_b32_e32 v4, v39
+; SI-NEXT:    v_mov_b32_e32 v10, v28
+; SI-NEXT:    v_mov_b32_e32 v8, v34
+; SI-NEXT:    v_mov_b32_e32 v12, v35
+; SI-NEXT:    v_mov_b32_e32 v18, v32
+; SI-NEXT:    v_mov_b32_e32 v16, v36
+; SI-NEXT:    v_mov_b32_e32 v20, v37
+; SI-NEXT:    v_mov_b32_e32 v26, v24
+; SI-NEXT:    v_mov_b32_e32 v24, v48
+; SI-NEXT:    v_mov_b32_e32 v28, v49
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bitcast_v8f32_to_v32i8_scalar:
@@ -15064,63 +15094,59 @@ define inreg <16 x i16> @bitcast_v4i64_to_v16i16_scalar(<4 x i64> inreg %a, i32
 ; SI-NEXT:    s_cmp_lg_u32 s24, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB57_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    v_alignbit_b32 v13, s23, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v9, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s23, 16
-; SI-NEXT:    s_lshr_b32 s7, s21, 16
-; SI-NEXT:    s_lshr_b32 s8, s19, 16
-; SI-NEXT:    s_lshr_b32 s9, s17, 16
+; SI-NEXT:    s_lshr_b32 s14, s23, 16
+; SI-NEXT:    s_lshr_b32 s15, s21, 16
+; SI-NEXT:    s_lshr_b32 s24, s19, 16
+; SI-NEXT:    s_lshr_b32 s25, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB57_3
 ; SI-NEXT:  .LBB57_2: ; %cmp.true
-; SI-NEXT:    s_add_u32 s16, s16, 3
-; SI-NEXT:    s_addc_u32 s17, s17, 0
-; SI-NEXT:    s_add_u32 s18, s18, 3
-; SI-NEXT:    s_addc_u32 s19, s19, 0
-; SI-NEXT:    s_add_u32 s20, s20, 3
-; SI-NEXT:    s_addc_u32 s21, s21, 0
 ; SI-NEXT:    s_add_u32 s22, s22, 3
 ; SI-NEXT:    s_addc_u32 s23, s23, 0
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    v_alignbit_b32 v13, s23, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v9, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s23, 16
-; SI-NEXT:    s_lshr_b32 s7, s21, 16
-; SI-NEXT:    s_lshr_b32 s8, s19, 16
-; SI-NEXT:    s_lshr_b32 s9, s17, 16
+; SI-NEXT:    s_add_u32 s20, s20, 3
+; SI-NEXT:    s_addc_u32 s21, s21, 0
+; SI-NEXT:    s_add_u32 s18, s18, 3
+; SI-NEXT:    s_addc_u32 s19, s19, 0
+; SI-NEXT:    s_add_u32 s16, s16, 3
+; SI-NEXT:    s_addc_u32 s17, s17, 0
+; SI-NEXT:    s_lshr_b32 s14, s23, 16
+; SI-NEXT:    s_lshr_b32 s15, s21, 16
+; SI-NEXT:    s_lshr_b32 s24, s19, 16
+; SI-NEXT:    s_lshr_b32 s25, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[16:17], 16
 ; SI-NEXT:  .LBB57_3: ; %end
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
+; SI-NEXT:    v_mov_b32_e32 v1, s10
 ; SI-NEXT:    v_mov_b32_e32 v2, s17
-; SI-NEXT:    v_mov_b32_e32 v3, s9
+; SI-NEXT:    v_mov_b32_e32 v3, s25
 ; SI-NEXT:    v_mov_b32_e32 v4, s18
+; SI-NEXT:    v_mov_b32_e32 v5, s8
 ; SI-NEXT:    v_mov_b32_e32 v6, s19
-; SI-NEXT:    v_mov_b32_e32 v7, s8
+; SI-NEXT:    v_mov_b32_e32 v7, s24
 ; SI-NEXT:    v_mov_b32_e32 v8, s20
+; SI-NEXT:    v_mov_b32_e32 v9, s6
 ; SI-NEXT:    v_mov_b32_e32 v10, s21
-; SI-NEXT:    v_mov_b32_e32 v11, s7
+; SI-NEXT:    v_mov_b32_e32 v11, s15
 ; SI-NEXT:    v_mov_b32_e32 v12, s22
+; SI-NEXT:    v_mov_b32_e32 v13, s4
 ; SI-NEXT:    v_mov_b32_e32 v14, s23
-; SI-NEXT:    v_mov_b32_e32 v15, s6
+; SI-NEXT:    v_mov_b32_e32 v15, s14
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB57_4:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr9
-; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr25
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $vgpr13
+; SI-NEXT:    ; implicit-def: $sgpr24
 ; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr15
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr14
 ; SI-NEXT:    s_branch .LBB57_2
 ;
 ; VI-LABEL: bitcast_v4i64_to_v16i16_scalar:
@@ -18815,119 +18841,123 @@ define inreg <32 x i8> @bitcast_v4i64_to_v32i8_scalar(<4 x i64> inreg %a, i32 in
 ; SI-NEXT:    s_cmp_lg_u32 s24, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB69_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    v_alignbit_b32 v27, s23, v0, 24
-; SI-NEXT:    v_alignbit_b32 v26, s23, v0, 16
-; SI-NEXT:    v_alignbit_b32 v25, s23, v0, 8
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v19, s21, v0, 24
-; SI-NEXT:    v_alignbit_b32 v18, s21, v0, 16
-; SI-NEXT:    v_alignbit_b32 v17, s21, v0, 8
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v11, s19, v0, 24
-; SI-NEXT:    v_alignbit_b32 v10, s19, v0, 16
-; SI-NEXT:    v_alignbit_b32 v9, s19, v0, 8
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v3, s17, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, s17, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 8
-; SI-NEXT:    s_lshr_b32 s6, s23, 24
-; SI-NEXT:    s_lshr_b32 s7, s23, 16
-; SI-NEXT:    s_lshr_b32 s9, s23, 8
-; SI-NEXT:    s_lshr_b32 s8, s21, 24
-; SI-NEXT:    s_lshr_b32 s10, s21, 16
-; SI-NEXT:    s_lshr_b32 s12, s21, 8
-; SI-NEXT:    s_lshr_b32 s11, s19, 24
-; SI-NEXT:    s_lshr_b32 s13, s19, 16
-; SI-NEXT:    s_lshr_b32 s14, s19, 8
-; SI-NEXT:    s_lshr_b32 s15, s17, 24
-; SI-NEXT:    s_lshr_b32 s24, s17, 16
-; SI-NEXT:    s_lshr_b32 s25, s17, 8
+; SI-NEXT:    s_lshr_b32 s56, s23, 24
+; SI-NEXT:    s_lshr_b32 s57, s23, 16
+; SI-NEXT:    s_lshr_b32 s58, s23, 8
+; SI-NEXT:    s_lshr_b32 s59, s21, 24
+; SI-NEXT:    s_lshr_b32 s60, s21, 16
+; SI-NEXT:    s_lshr_b32 s61, s21, 8
+; SI-NEXT:    s_lshr_b32 s62, s19, 24
+; SI-NEXT:    s_lshr_b32 s63, s19, 16
+; SI-NEXT:    s_lshr_b32 s72, s19, 8
+; SI-NEXT:    s_lshr_b32 s73, s17, 24
+; SI-NEXT:    s_lshr_b32 s74, s17, 16
+; SI-NEXT:    s_lshr_b32 s75, s17, 8
+; SI-NEXT:    s_lshr_b64 s[4:5], s[22:23], 24
+; SI-NEXT:    s_lshr_b64 s[6:7], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[22:23], 8
+; SI-NEXT:    s_lshr_b64 s[10:11], s[20:21], 24
+; SI-NEXT:    s_lshr_b64 s[12:13], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[20:21], 8
+; SI-NEXT:    s_lshr_b64 s[24:25], s[18:19], 24
+; SI-NEXT:    s_lshr_b64 s[26:27], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[28:29], s[18:19], 8
+; SI-NEXT:    s_lshr_b64 s[40:41], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[42:43], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[16:17], 8
 ; SI-NEXT:    s_cbranch_execnz .LBB69_3
 ; SI-NEXT:  .LBB69_2: ; %cmp.true
-; SI-NEXT:    s_add_u32 s16, s16, 3
-; SI-NEXT:    s_addc_u32 s17, s17, 0
-; SI-NEXT:    s_add_u32 s18, s18, 3
-; SI-NEXT:    s_addc_u32 s19, s19, 0
-; SI-NEXT:    s_add_u32 s20, s20, 3
-; SI-NEXT:    s_addc_u32 s21, s21, 0
 ; SI-NEXT:    s_add_u32 s22, s22, 3
 ; SI-NEXT:    s_addc_u32 s23, s23, 0
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    v_alignbit_b32 v27, s23, v0, 24
-; SI-NEXT:    v_alignbit_b32 v26, s23, v0, 16
-; SI-NEXT:    v_alignbit_b32 v25, s23, v0, 8
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v19, s21, v0, 24
-; SI-NEXT:    v_alignbit_b32 v18, s21, v0, 16
-; SI-NEXT:    v_alignbit_b32 v17, s21, v0, 8
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v11, s19, v0, 24
-; SI-NEXT:    v_alignbit_b32 v10, s19, v0, 16
-; SI-NEXT:    v_alignbit_b32 v9, s19, v0, 8
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v3, s17, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, s17, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 8
-; SI-NEXT:    s_lshr_b32 s6, s23, 24
-; SI-NEXT:    s_lshr_b32 s7, s23, 16
-; SI-NEXT:    s_lshr_b32 s9, s23, 8
-; SI-NEXT:    s_lshr_b32 s8, s21, 24
-; SI-NEXT:    s_lshr_b32 s10, s21, 16
-; SI-NEXT:    s_lshr_b32 s12, s21, 8
-; SI-NEXT:    s_lshr_b32 s11, s19, 24
-; SI-NEXT:    s_lshr_b32 s13, s19, 16
-; SI-NEXT:    s_lshr_b32 s14, s19, 8
-; SI-NEXT:    s_lshr_b32 s15, s17, 24
-; SI-NEXT:    s_lshr_b32 s24, s17, 16
-; SI-NEXT:    s_lshr_b32 s25, s17, 8
+; SI-NEXT:    s_add_u32 s20, s20, 3
+; SI-NEXT:    s_addc_u32 s21, s21, 0
+; SI-NEXT:    s_add_u32 s18, s18, 3
+; SI-NEXT:    s_addc_u32 s19, s19, 0
+; SI-NEXT:    s_add_u32 s16, s16, 3
+; SI-NEXT:    s_addc_u32 s17, s17, 0
+; SI-NEXT:    s_lshr_b32 s56, s23, 24
+; SI-NEXT:    s_lshr_b32 s57, s23, 16
+; SI-NEXT:    s_lshr_b32 s58, s23, 8
+; SI-NEXT:    s_lshr_b32 s59, s21, 24
+; SI-NEXT:    s_lshr_b32 s60, s21, 16
+; SI-NEXT:    s_lshr_b32 s61, s21, 8
+; SI-NEXT:    s_lshr_b32 s62, s19, 24
+; SI-NEXT:    s_lshr_b32 s63, s19, 16
+; SI-NEXT:    s_lshr_b32 s72, s19, 8
+; SI-NEXT:    s_lshr_b32 s73, s17, 24
+; SI-NEXT:    s_lshr_b32 s74, s17, 16
+; SI-NEXT:    s_lshr_b32 s75, s17, 8
+; SI-NEXT:    s_lshr_b64 s[4:5], s[22:23], 24
+; SI-NEXT:    s_lshr_b64 s[6:7], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[22:23], 8
+; SI-NEXT:    s_lshr_b64 s[10:11], s[20:21], 24
+; SI-NEXT:    s_lshr_b64 s[12:13], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[20:21], 8
+; SI-NEXT:    s_lshr_b64 s[24:25], s[18:19], 24
+; SI-NEXT:    s_lshr_b64 s[26:27], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[28:29], s[18:19], 8
+; SI-NEXT:    s_lshr_b64 s[40:41], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[42:43], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[16:17], 8
 ; SI-NEXT:  .LBB69_3: ; %end
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
+; SI-NEXT:    v_mov_b32_e32 v1, s44
+; SI-NEXT:    v_mov_b32_e32 v2, s42
+; SI-NEXT:    v_mov_b32_e32 v3, s40
 ; SI-NEXT:    v_mov_b32_e32 v4, s17
-; SI-NEXT:    v_mov_b32_e32 v5, s25
-; SI-NEXT:    v_mov_b32_e32 v6, s24
-; SI-NEXT:    v_mov_b32_e32 v7, s15
+; SI-NEXT:    v_mov_b32_e32 v5, s75
+; SI-NEXT:    v_mov_b32_e32 v6, s74
+; SI-NEXT:    v_mov_b32_e32 v7, s73
 ; SI-NEXT:    v_mov_b32_e32 v8, s18
+; SI-NEXT:    v_mov_b32_e32 v9, s28
+; SI-NEXT:    v_mov_b32_e32 v10, s26
+; SI-NEXT:    v_mov_b32_e32 v11, s24
 ; SI-NEXT:    v_mov_b32_e32 v12, s19
-; SI-NEXT:    v_mov_b32_e32 v13, s14
-; SI-NEXT:    v_mov_b32_e32 v14, s13
-; SI-NEXT:    v_mov_b32_e32 v15, s11
+; SI-NEXT:    v_mov_b32_e32 v13, s72
+; SI-NEXT:    v_mov_b32_e32 v14, s63
+; SI-NEXT:    v_mov_b32_e32 v15, s62
 ; SI-NEXT:    v_mov_b32_e32 v16, s20
+; SI-NEXT:    v_mov_b32_e32 v17, s14
+; SI-NEXT:    v_mov_b32_e32 v18, s12
+; SI-NEXT:    v_mov_b32_e32 v19, s10
 ; SI-NEXT:    v_mov_b32_e32 v20, s21
-; SI-NEXT:    v_mov_b32_e32 v21, s12
-; SI-NEXT:    v_mov_b32_e32 v22, s10
-; SI-NEXT:    v_mov_b32_e32 v23, s8
+; SI-NEXT:    v_mov_b32_e32 v21, s61
+; SI-NEXT:    v_mov_b32_e32 v22, s60
+; SI-NEXT:    v_mov_b32_e32 v23, s59
 ; SI-NEXT:    v_mov_b32_e32 v24, s22
+; SI-NEXT:    v_mov_b32_e32 v25, s8
+; SI-NEXT:    v_mov_b32_e32 v26, s6
+; SI-NEXT:    v_mov_b32_e32 v27, s4
 ; SI-NEXT:    v_mov_b32_e32 v28, s23
-; SI-NEXT:    v_mov_b32_e32 v29, s9
-; SI-NEXT:    v_mov_b32_e32 v30, s7
-; SI-NEXT:    v_mov_b32_e32 v31, s6
+; SI-NEXT:    v_mov_b32_e32 v29, s58
+; SI-NEXT:    v_mov_b32_e32 v30, s57
+; SI-NEXT:    v_mov_b32_e32 v31, s56
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB69_4:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $sgpr25
+; SI-NEXT:    ; implicit-def: $sgpr44
+; SI-NEXT:    ; implicit-def: $sgpr42
+; SI-NEXT:    ; implicit-def: $sgpr40
+; SI-NEXT:    ; implicit-def: $sgpr75
+; SI-NEXT:    ; implicit-def: $sgpr74
+; SI-NEXT:    ; implicit-def: $sgpr73
+; SI-NEXT:    ; implicit-def: $sgpr28
+; SI-NEXT:    ; implicit-def: $sgpr26
 ; SI-NEXT:    ; implicit-def: $sgpr24
-; SI-NEXT:    ; implicit-def: $sgpr15
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $vgpr11
+; SI-NEXT:    ; implicit-def: $sgpr72
+; SI-NEXT:    ; implicit-def: $sgpr63
+; SI-NEXT:    ; implicit-def: $sgpr62
 ; SI-NEXT:    ; implicit-def: $sgpr14
-; SI-NEXT:    ; implicit-def: $sgpr13
-; SI-NEXT:    ; implicit-def: $sgpr11
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr18
-; SI-NEXT:    ; implicit-def: $vgpr19
 ; SI-NEXT:    ; implicit-def: $sgpr12
 ; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr61
+; SI-NEXT:    ; implicit-def: $sgpr60
+; SI-NEXT:    ; implicit-def: $sgpr59
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr26
-; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $sgpr9
-; SI-NEXT:    ; implicit-def: $sgpr7
 ; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr58
+; SI-NEXT:    ; implicit-def: $sgpr57
+; SI-NEXT:    ; implicit-def: $sgpr56
 ; SI-NEXT:    s_branch .LBB69_2
 ;
 ; VI-LABEL: bitcast_v4i64_to_v32i8_scalar:
@@ -21155,56 +21185,56 @@ define inreg <16 x i16> @bitcast_v4f64_to_v16i16_scalar(<4 x double> inreg %a, i
 ; SI-NEXT:    s_cmp_lg_u32 s24, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB73_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    v_alignbit_b32 v13, s23, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v9, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s9, s23, 16
-; SI-NEXT:    s_lshr_b32 s8, s21, 16
-; SI-NEXT:    s_lshr_b32 s7, s19, 16
-; SI-NEXT:    s_lshr_b32 s6, s17, 16
+; SI-NEXT:    s_lshr_b32 s25, s23, 16
+; SI-NEXT:    s_lshr_b32 s24, s21, 16
+; SI-NEXT:    s_lshr_b32 s15, s19, 16
+; SI-NEXT:    s_lshr_b32 s14, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB73_4
 ; SI-NEXT:  .LBB73_2: ; %cmp.true
-; SI-NEXT:    v_add_f64 v[22:23], s[16:17], 1.0
-; SI-NEXT:    v_add_f64 v[20:21], s[18:19], 1.0
 ; SI-NEXT:    v_add_f64 v[16:17], s[22:23], 1.0
 ; SI-NEXT:    v_add_f64 v[18:19], s[20:21], 1.0
-; SI-NEXT:    v_alignbit_b32 v13, v17, v16, 16
-; SI-NEXT:    v_alignbit_b32 v9, v19, v18, 16
-; SI-NEXT:    v_alignbit_b32 v5, v21, v20, 16
-; SI-NEXT:    v_alignbit_b32 v1, v23, v22, 16
+; SI-NEXT:    v_add_f64 v[20:21], s[18:19], 1.0
+; SI-NEXT:    v_add_f64 v[22:23], s[16:17], 1.0
+; SI-NEXT:    v_lshr_b64 v[13:14], v[16:17], 16
+; SI-NEXT:    v_lshr_b64 v[9:10], v[18:19], 16
+; SI-NEXT:    v_lshr_b64 v[5:6], v[20:21], 16
+; SI-NEXT:    v_lshr_b64 v[1:2], v[22:23], 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v17
 ; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v19
 ; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v21
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v23
 ; SI-NEXT:    s_branch .LBB73_5
 ; SI-NEXT:  .LBB73_3:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr14
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $sgpr9
+; SI-NEXT:    ; implicit-def: $sgpr15
+; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr24
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr25
 ; SI-NEXT:    s_branch .LBB73_2
 ; SI-NEXT:  .LBB73_4:
-; SI-NEXT:    v_mov_b32_e32 v23, s17
-; SI-NEXT:    v_mov_b32_e32 v21, s19
-; SI-NEXT:    v_mov_b32_e32 v19, s21
 ; SI-NEXT:    v_mov_b32_e32 v17, s23
-; SI-NEXT:    v_mov_b32_e32 v16, s22
-; SI-NEXT:    v_mov_b32_e32 v18, s20
-; SI-NEXT:    v_mov_b32_e32 v20, s18
+; SI-NEXT:    v_mov_b32_e32 v19, s21
+; SI-NEXT:    v_mov_b32_e32 v21, s19
+; SI-NEXT:    v_mov_b32_e32 v23, s17
 ; SI-NEXT:    v_mov_b32_e32 v22, s16
-; SI-NEXT:    v_mov_b32_e32 v3, s6
-; SI-NEXT:    v_mov_b32_e32 v7, s7
-; SI-NEXT:    v_mov_b32_e32 v11, s8
-; SI-NEXT:    v_mov_b32_e32 v15, s9
+; SI-NEXT:    v_mov_b32_e32 v20, s18
+; SI-NEXT:    v_mov_b32_e32 v18, s20
+; SI-NEXT:    v_mov_b32_e32 v16, s22
+; SI-NEXT:    v_mov_b32_e32 v15, s25
+; SI-NEXT:    v_mov_b32_e32 v11, s24
+; SI-NEXT:    v_mov_b32_e32 v7, s15
+; SI-NEXT:    v_mov_b32_e32 v3, s14
+; SI-NEXT:    v_mov_b32_e32 v1, s10
+; SI-NEXT:    v_mov_b32_e32 v5, s8
+; SI-NEXT:    v_mov_b32_e32 v9, s6
+; SI-NEXT:    v_mov_b32_e32 v13, s4
 ; SI-NEXT:  .LBB73_5: ; %end
 ; SI-NEXT:    v_mov_b32_e32 v0, v22
 ; SI-NEXT:    v_mov_b32_e32 v2, v23
@@ -24819,121 +24849,133 @@ define inreg <32 x i8> @bitcast_v4f64_to_v32i8_scalar(<4 x double> inreg %a, i32
 ; SI-NEXT:    s_cmp_lg_u32 s24, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB85_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    v_alignbit_b32 v27, s23, v0, 24
-; SI-NEXT:    v_alignbit_b32 v26, s23, v0, 16
-; SI-NEXT:    v_alignbit_b32 v32, s23, v0, 8
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v19, s21, v0, 24
-; SI-NEXT:    v_alignbit_b32 v18, s21, v0, 16
-; SI-NEXT:    v_alignbit_b32 v33, s21, v0, 8
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v11, s19, v0, 24
-; SI-NEXT:    v_alignbit_b32 v10, s19, v0, 16
-; SI-NEXT:    v_alignbit_b32 v34, s19, v0, 8
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v3, s17, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, s17, v0, 16
-; SI-NEXT:    v_alignbit_b32 v35, s17, v0, 8
-; SI-NEXT:    s_lshr_b32 s25, s23, 24
-; SI-NEXT:    s_lshr_b32 s24, s23, 16
-; SI-NEXT:    s_lshr_b32 s15, s23, 8
-; SI-NEXT:    s_lshr_b32 s14, s21, 24
-; SI-NEXT:    s_lshr_b32 s13, s21, 16
-; SI-NEXT:    s_lshr_b32 s12, s21, 8
-; SI-NEXT:    s_lshr_b32 s11, s19, 24
-; SI-NEXT:    s_lshr_b32 s10, s19, 16
-; SI-NEXT:    s_lshr_b32 s9, s19, 8
-; SI-NEXT:    s_lshr_b32 s8, s17, 24
-; SI-NEXT:    s_lshr_b32 s7, s17, 16
-; SI-NEXT:    s_lshr_b32 s6, s17, 8
+; SI-NEXT:    s_lshr_b32 s75, s23, 24
+; SI-NEXT:    s_lshr_b32 s74, s23, 16
+; SI-NEXT:    s_lshr_b32 s73, s23, 8
+; SI-NEXT:    s_lshr_b32 s72, s21, 24
+; SI-NEXT:    s_lshr_b32 s63, s21, 16
+; SI-NEXT:    s_lshr_b32 s62, s21, 8
+; SI-NEXT:    s_lshr_b32 s61, s19, 24
+; SI-NEXT:    s_lshr_b32 s60, s19, 16
+; SI-NEXT:    s_lshr_b32 s59, s19, 8
+; SI-NEXT:    s_lshr_b32 s58, s17, 24
+; SI-NEXT:    s_lshr_b32 s57, s17, 16
+; SI-NEXT:    s_lshr_b32 s56, s17, 8
+; SI-NEXT:    s_lshr_b64 s[4:5], s[22:23], 24
+; SI-NEXT:    s_lshr_b64 s[6:7], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[22:23], 8
+; SI-NEXT:    s_lshr_b64 s[10:11], s[20:21], 24
+; SI-NEXT:    s_lshr_b64 s[12:13], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[20:21], 8
+; SI-NEXT:    s_lshr_b64 s[24:25], s[18:19], 24
+; SI-NEXT:    s_lshr_b64 s[26:27], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[28:29], s[18:19], 8
+; SI-NEXT:    s_lshr_b64 s[40:41], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[42:43], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[16:17], 8
 ; SI-NEXT:    s_cbranch_execnz .LBB85_4
 ; SI-NEXT:  .LBB85_2: ; %cmp.true
-; SI-NEXT:    v_add_f64 v[0:1], s[16:17], 1.0
-; SI-NEXT:    v_add_f64 v[8:9], s[18:19], 1.0
-; SI-NEXT:    v_add_f64 v[24:25], s[22:23], 1.0
-; SI-NEXT:    v_add_f64 v[16:17], s[20:21], 1.0
-; SI-NEXT:    v_alignbit_b32 v27, v25, v24, 24
-; SI-NEXT:    v_alignbit_b32 v26, v25, v24, 16
-; SI-NEXT:    v_alignbit_b32 v32, v25, v24, 8
-; SI-NEXT:    v_alignbit_b32 v19, v17, v16, 24
-; SI-NEXT:    v_alignbit_b32 v18, v17, v16, 16
-; SI-NEXT:    v_alignbit_b32 v33, v17, v16, 8
-; SI-NEXT:    v_alignbit_b32 v11, v9, v8, 24
-; SI-NEXT:    v_alignbit_b32 v10, v9, v8, 16
-; SI-NEXT:    v_alignbit_b32 v34, v9, v8, 8
-; SI-NEXT:    v_alignbit_b32 v3, v1, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, v1, v0, 16
-; SI-NEXT:    v_alignbit_b32 v35, v1, v0, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 24, v25
-; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v25
-; SI-NEXT:    v_lshrrev_b32_e32 v29, 8, v25
-; SI-NEXT:    v_lshrrev_b32_e32 v23, 24, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v21, 8, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 24, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v13, 8, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; SI-NEXT:    v_add_f64 v[50:51], s[22:23], 1.0
+; SI-NEXT:    v_add_f64 v[37:38], s[20:21], 1.0
+; SI-NEXT:    v_add_f64 v[48:49], s[16:17], 1.0
+; SI-NEXT:    v_add_f64 v[35:36], s[18:19], 1.0
+; SI-NEXT:    v_lshr_b64 v[24:25], v[50:51], 16
+; SI-NEXT:    v_lshr_b64 v[32:33], v[37:38], 16
+; SI-NEXT:    v_lshr_b64 v[0:1], v[48:49], 16
+; SI-NEXT:    v_lshr_b64 v[27:28], v[50:51], 24
+; SI-NEXT:    v_lshr_b64 v[25:26], v[50:51], 8
+; SI-NEXT:    v_lshr_b64 v[19:20], v[37:38], 24
+; SI-NEXT:    v_lshr_b64 v[17:18], v[37:38], 8
+; SI-NEXT:    v_lshr_b64 v[11:12], v[35:36], 24
+; SI-NEXT:    v_lshr_b64 v[33:34], v[35:36], 16
+; SI-NEXT:    v_lshr_b64 v[9:10], v[35:36], 8
+; SI-NEXT:    v_lshr_b64 v[3:4], v[48:49], 24
+; SI-NEXT:    v_lshr_b64 v[1:2], v[48:49], 8
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 24, v51
+; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v51
+; SI-NEXT:    v_lshrrev_b32_e32 v29, 8, v51
+; SI-NEXT:    v_lshrrev_b32_e32 v23, 24, v38
+; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v38
+; SI-NEXT:    v_lshrrev_b32_e32 v21, 8, v38
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 24, v36
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v36
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 8, v36
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 24, v49
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v49
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v49
 ; SI-NEXT:    s_branch .LBB85_5
 ; SI-NEXT:  .LBB85_3:
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $vgpr34
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; implicit-def: $sgpr9
-; SI-NEXT:    ; implicit-def: $sgpr10
-; SI-NEXT:    ; implicit-def: $sgpr11
-; SI-NEXT:    ; implicit-def: $vgpr33
-; SI-NEXT:    ; implicit-def: $vgpr18
-; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    ; implicit-def: $sgpr12
-; SI-NEXT:    ; implicit-def: $sgpr13
-; SI-NEXT:    ; implicit-def: $sgpr14
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr26
-; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $sgpr15
+; SI-NEXT:    ; implicit-def: $sgpr44
+; SI-NEXT:    ; implicit-def: $sgpr42
+; SI-NEXT:    ; implicit-def: $sgpr40
+; SI-NEXT:    ; implicit-def: $sgpr56
+; SI-NEXT:    ; implicit-def: $sgpr57
+; SI-NEXT:    ; implicit-def: $sgpr58
+; SI-NEXT:    ; implicit-def: $sgpr28
+; SI-NEXT:    ; implicit-def: $sgpr26
 ; SI-NEXT:    ; implicit-def: $sgpr24
-; SI-NEXT:    ; implicit-def: $sgpr25
+; SI-NEXT:    ; implicit-def: $sgpr59
+; SI-NEXT:    ; implicit-def: $sgpr60
+; SI-NEXT:    ; implicit-def: $sgpr61
+; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr62
+; SI-NEXT:    ; implicit-def: $sgpr63
+; SI-NEXT:    ; implicit-def: $sgpr72
+; SI-NEXT:    ; implicit-def: $sgpr8
+; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr73
+; SI-NEXT:    ; implicit-def: $sgpr74
+; SI-NEXT:    ; implicit-def: $sgpr75
 ; SI-NEXT:    s_branch .LBB85_2
 ; SI-NEXT:  .LBB85_4:
-; SI-NEXT:    v_mov_b32_e32 v1, s17
-; SI-NEXT:    v_mov_b32_e32 v9, s19
-; SI-NEXT:    v_mov_b32_e32 v17, s21
-; SI-NEXT:    v_mov_b32_e32 v25, s23
-; SI-NEXT:    v_mov_b32_e32 v24, s22
-; SI-NEXT:    v_mov_b32_e32 v16, s20
-; SI-NEXT:    v_mov_b32_e32 v8, s18
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_mov_b32_e32 v7, s8
-; SI-NEXT:    v_mov_b32_e32 v6, s7
-; SI-NEXT:    v_mov_b32_e32 v5, s6
-; SI-NEXT:    v_mov_b32_e32 v15, s11
-; SI-NEXT:    v_mov_b32_e32 v14, s10
-; SI-NEXT:    v_mov_b32_e32 v13, s9
-; SI-NEXT:    v_mov_b32_e32 v23, s14
-; SI-NEXT:    v_mov_b32_e32 v22, s13
-; SI-NEXT:    v_mov_b32_e32 v21, s12
-; SI-NEXT:    v_mov_b32_e32 v31, s25
-; SI-NEXT:    v_mov_b32_e32 v30, s24
-; SI-NEXT:    v_mov_b32_e32 v29, s15
+; SI-NEXT:    v_mov_b32_e32 v51, s23
+; SI-NEXT:    v_mov_b32_e32 v38, s21
+; SI-NEXT:    v_mov_b32_e32 v36, s19
+; SI-NEXT:    v_mov_b32_e32 v49, s17
+; SI-NEXT:    v_mov_b32_e32 v48, s16
+; SI-NEXT:    v_mov_b32_e32 v35, s18
+; SI-NEXT:    v_mov_b32_e32 v37, s20
+; SI-NEXT:    v_mov_b32_e32 v50, s22
+; SI-NEXT:    v_mov_b32_e32 v31, s75
+; SI-NEXT:    v_mov_b32_e32 v30, s74
+; SI-NEXT:    v_mov_b32_e32 v29, s73
+; SI-NEXT:    v_mov_b32_e32 v23, s72
+; SI-NEXT:    v_mov_b32_e32 v22, s63
+; SI-NEXT:    v_mov_b32_e32 v21, s62
+; SI-NEXT:    v_mov_b32_e32 v15, s61
+; SI-NEXT:    v_mov_b32_e32 v14, s60
+; SI-NEXT:    v_mov_b32_e32 v13, s59
+; SI-NEXT:    v_mov_b32_e32 v7, s58
+; SI-NEXT:    v_mov_b32_e32 v6, s57
+; SI-NEXT:    v_mov_b32_e32 v5, s56
+; SI-NEXT:    v_mov_b32_e32 v1, s44
+; SI-NEXT:    v_mov_b32_e32 v0, s42
+; SI-NEXT:    v_mov_b32_e32 v3, s40
+; SI-NEXT:    v_mov_b32_e32 v9, s28
+; SI-NEXT:    v_mov_b32_e32 v33, s26
+; SI-NEXT:    v_mov_b32_e32 v11, s24
+; SI-NEXT:    v_mov_b32_e32 v17, s14
+; SI-NEXT:    v_mov_b32_e32 v32, s12
+; SI-NEXT:    v_mov_b32_e32 v19, s10
+; SI-NEXT:    v_mov_b32_e32 v25, s8
+; SI-NEXT:    v_mov_b32_e32 v24, s6
+; SI-NEXT:    v_mov_b32_e32 v27, s4
 ; SI-NEXT:  .LBB85_5: ; %end
-; SI-NEXT:    v_mov_b32_e32 v4, v1
-; SI-NEXT:    v_mov_b32_e32 v12, v9
-; SI-NEXT:    v_mov_b32_e32 v20, v17
-; SI-NEXT:    v_mov_b32_e32 v28, v25
-; SI-NEXT:    v_mov_b32_e32 v1, v35
-; SI-NEXT:    v_mov_b32_e32 v9, v34
-; SI-NEXT:    v_mov_b32_e32 v17, v33
-; SI-NEXT:    v_mov_b32_e32 v25, v32
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    v_mov_b32_e32 v0, v48
+; SI-NEXT:    v_mov_b32_e32 v4, v49
+; SI-NEXT:    v_mov_b32_e32 v10, v33
+; SI-NEXT:    v_mov_b32_e32 v8, v35
+; SI-NEXT:    v_mov_b32_e32 v12, v36
+; SI-NEXT:    v_mov_b32_e32 v18, v32
+; SI-NEXT:    v_mov_b32_e32 v16, v37
+; SI-NEXT:    v_mov_b32_e32 v20, v38
+; SI-NEXT:    v_mov_b32_e32 v26, v24
+; SI-NEXT:    v_mov_b32_e32 v24, v50
+; SI-NEXT:    v_mov_b32_e32 v28, v51
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bitcast_v4f64_to_v32i8_scalar:
@@ -27681,26 +27723,24 @@ define inreg <16 x i16> @bitcast_v16f16_to_v16i16_scalar(<16 x half> inreg %a, i
 ; SI-LABEL: bitcast_v16f16_to_v16i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v15, v1
-; SI-NEXT:    v_mov_b32_e32 v14, v0
-; SI-NEXT:    v_mov_b32_e32 v16, v2
+; SI-NEXT:    v_mov_b32_e32 v5, v0
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, s16
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v18, s17
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, s18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, s19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, s20
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, s21
+; SI-NEXT:    v_cvt_f16_f32_e32 v21, s21
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, s22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, s23
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, s24
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, s25
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, s26
+; SI-NEXT:    v_cvt_f16_f32_e32 v19, s25
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, s27
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, s28
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, s29
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v16
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v15, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, s26
+; SI-NEXT:    v_cvt_f16_f32_e32 v16, s29
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_cbranch_scc0 .LBB91_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
@@ -27713,16 +27753,12 @@ define inreg <16 x i16> @bitcast_v16f16_to_v16i16_scalar(<16 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v18
 ; SI-NEXT:    v_add_f32_e32 v15, 0x38000000, v15
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v21
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v19
+; SI-NEXT:    v_cvt_f32_f16_e32 v13, v16
 ; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
 ; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
 ; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
@@ -27730,51 +27766,59 @@ define inreg <16 x i16> @bitcast_v16f16_to_v16i16_scalar(<16 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
 ; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
 ; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
-; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
-; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
+; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
+; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
 ; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
+; SI-NEXT:    v_or_b32_e32 v14, v14, v16
+; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v11
+; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
+; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
+; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
 ; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_or_b32_e32 v10, v10, v16
+; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v7
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; SI-NEXT:    v_or_b32_e32 v14, v14, v16
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v11
-; SI-NEXT:    v_or_b32_e32 v10, v10, v16
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v7
 ; SI-NEXT:    v_or_b32_e32 v6, v6, v16
 ; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_or_b32_e32 v2, v2, v16
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT:    v_or_b32_e32 v2, v2, v16
+; SI-NEXT:    v_lshr_b64 v[18:19], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[21:22], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[19:20], v[9:10], 16
+; SI-NEXT:    v_lshr_b64 v[16:17], v[13:14], 16
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SI-NEXT:    v_or_b32_e32 v4, v4, v5
 ; SI-NEXT:    v_or_b32_e32 v8, v8, v9
 ; SI-NEXT:    v_or_b32_e32 v12, v12, v13
-; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v9, v10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v13, v14, v13, 16
 ; SI-NEXT:  .LBB91_3: ; %end
+; SI-NEXT:    v_mov_b32_e32 v1, v18
+; SI-NEXT:    v_mov_b32_e32 v5, v21
+; SI-NEXT:    v_mov_b32_e32 v9, v19
+; SI-NEXT:    v_mov_b32_e32 v13, v16
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB91_4:
 ; SI-NEXT:    s_branch .LBB91_2
@@ -29151,115 +29195,119 @@ define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    v_mul_f32_e64 v31, 1.0, s16
 ; SI-NEXT:    v_mul_f32_e64 v30, 1.0, s17
-; SI-NEXT:    v_mul_f32_e64 v17, 1.0, s18
+; SI-NEXT:    v_mul_f32_e64 v20, 1.0, s18
 ; SI-NEXT:    v_mul_f32_e64 v16, 1.0, s19
 ; SI-NEXT:    v_mul_f32_e64 v29, 1.0, s20
 ; SI-NEXT:    v_mul_f32_e64 v28, 1.0, s21
-; SI-NEXT:    v_mul_f32_e64 v19, 1.0, s22
-; SI-NEXT:    v_mul_f32_e64 v18, 1.0, s23
+; SI-NEXT:    v_mul_f32_e64 v22, 1.0, s22
+; SI-NEXT:    v_mul_f32_e64 v5, 1.0, s23
 ; SI-NEXT:    v_mul_f32_e64 v27, 1.0, s24
 ; SI-NEXT:    v_mul_f32_e64 v26, 1.0, s25
-; SI-NEXT:    v_mul_f32_e64 v21, 1.0, s26
-; SI-NEXT:    v_mul_f32_e64 v20, 1.0, s27
+; SI-NEXT:    v_mul_f32_e64 v23, 1.0, s26
+; SI-NEXT:    v_mul_f32_e64 v9, 1.0, s27
 ; SI-NEXT:    v_mul_f32_e64 v25, 1.0, s28
 ; SI-NEXT:    v_mul_f32_e64 v24, 1.0, s29
-; SI-NEXT:    v_mul_f32_e32 v23, 1.0, v0
-; SI-NEXT:    v_mul_f32_e32 v22, 1.0, v1
+; SI-NEXT:    v_mul_f32_e32 v13, 1.0, v0
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; SI-NEXT:    s_cbranch_scc0 .LBB95_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v30
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v17
+; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v30
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v20
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v16
 ; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v29
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v28
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v19
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v28
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
 ; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v27
-; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v26
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v21
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v9
 ; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v25
-; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v24
-; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v23
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v26
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v23
+; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v24
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v13
 ; SI-NEXT:    s_cbranch_execnz .LBB95_3
 ; SI-NEXT:  .LBB95_2: ; %cmp.true
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v30
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v30
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v31
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_add_f32_e32 v12, 0x40c00000, v2
 ; SI-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v12
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v28
 ; SI-NEXT:    v_alignbit_b32 v0, v2, v0, 16
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v29
 ; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
 ; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v26
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v26
 ; SI-NEXT:    v_alignbit_b32 v4, v4, v2, 16
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v27
-; SI-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
-; SI-NEXT:    v_alignbit_b32 v8, v6, v2, 16
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v24
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v25
 ; SI-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
 ; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; SI-NEXT:    v_alignbit_b32 v12, v7, v2, 16
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v22
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v23
-; SI-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT:    v_alignbit_b32 v8, v7, v2, 16
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v25
+; SI-NEXT:    v_add_f32_e32 v25, 0x40c00000, v2
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v24
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v6
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v20
-; SI-NEXT:    v_alignbit_b32 v14, v15, v2, 16
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v21
-; SI-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v24, 16, v2
+; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v13
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v2
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v9
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v1
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v23
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v6
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v18
-; SI-NEXT:    v_alignbit_b32 v10, v11, v2, 16
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v19
-; SI-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
+; SI-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT:    v_alignbit_b32 v10, v11, v1, 16
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v22
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v16
-; SI-NEXT:    v_alignbit_b32 v6, v7, v2, 16
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v17
-; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT:    v_alignbit_b32 v14, v15, v7, 16
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v16
+; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v6
+; SI-NEXT:    v_alignbit_b32 v6, v7, v1, 16
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v20
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v9, v10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v13, v14, v13, 16
+; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; SI-NEXT:    v_alignbit_b32 v2, v3, v1, 16
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v12
+; SI-NEXT:    v_lshr_b64 v[17:18], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[18:19], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[21:22], v[9:10], 16
+; SI-NEXT:    v_lshr_b64 v[19:20], v[13:14], 16
+; SI-NEXT:    v_alignbit_b32 v12, v24, v25, 16
 ; SI-NEXT:  .LBB95_3: ; %end
+; SI-NEXT:    v_mov_b32_e32 v1, v17
+; SI-NEXT:    v_mov_b32_e32 v5, v18
+; SI-NEXT:    v_mov_b32_e32 v9, v21
+; SI-NEXT:    v_mov_b32_e32 v13, v19
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB95_4:
 ; SI-NEXT:    ; implicit-def: $vgpr0
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr17
 ; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $vgpr18
 ; SI-NEXT:    ; implicit-def: $vgpr6
 ; SI-NEXT:    ; implicit-def: $vgpr7
 ; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $vgpr21
 ; SI-NEXT:    ; implicit-def: $vgpr10
 ; SI-NEXT:    ; implicit-def: $vgpr11
 ; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $vgpr14
 ; SI-NEXT:    ; implicit-def: $vgpr15
+; SI-NEXT:    ; implicit-def: $vgpr19
+; SI-NEXT:    ; implicit-def: $vgpr14
 ; SI-NEXT:    s_branch .LBB95_2
 ;
 ; VI-LABEL: bitcast_v16bf16_to_v16i16_scalar:
@@ -30446,80 +30494,83 @@ define inreg <32 x i8> @bitcast_v16i16_to_v32i8_scalar(<16 x i16> inreg %a, i32
 ; SI-LABEL: bitcast_v16i16_to_v32i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v4, v1
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; SI-NEXT:    v_readfirstlane_b32 s78, v1
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v4
+; SI-NEXT:    v_readfirstlane_b32 s79, v0
 ; SI-NEXT:    s_cbranch_scc0 .LBB97_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_and_b32 s4, s16, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s17, 16
-; SI-NEXT:    s_or_b32 s11, s4, s5
-; SI-NEXT:    s_and_b32 s4, s18, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s19, 16
-; SI-NEXT:    s_or_b32 s12, s4, s5
-; SI-NEXT:    s_and_b32 s4, s20, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s21, 16
-; SI-NEXT:    s_or_b32 s6, s4, s5
-; SI-NEXT:    s_and_b32 s4, s22, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s23, 16
-; SI-NEXT:    s_or_b32 s9, s4, s5
-; SI-NEXT:    s_and_b32 s4, s24, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s25, 16
-; SI-NEXT:    v_mov_b32_e32 v6, s6
-; SI-NEXT:    s_or_b32 s7, s4, s5
-; SI-NEXT:    s_and_b32 s4, s26, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s27, 16
-; SI-NEXT:    v_alignbit_b32 v11, s9, v6, 24
-; SI-NEXT:    v_alignbit_b32 v10, s9, v6, 16
-; SI-NEXT:    v_alignbit_b32 v9, s9, v6, 8
-; SI-NEXT:    s_or_b32 s10, s4, s5
-; SI-NEXT:    v_mov_b32_e32 v6, s7
-; SI-NEXT:    v_alignbit_b32 v19, s10, v6, 24
-; SI-NEXT:    v_alignbit_b32 v18, s10, v6, 16
-; SI-NEXT:    v_alignbit_b32 v17, s10, v6, 8
-; SI-NEXT:    s_and_b32 s4, s28, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s29, 16
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v0
-; SI-NEXT:    v_mov_b32_e32 v1, s11
-; SI-NEXT:    s_or_b32 s8, s4, s5
-; SI-NEXT:    v_or_b32_e32 v28, v6, v5
-; SI-NEXT:    v_alignbit_b32 v3, s12, v1, 24
-; SI-NEXT:    v_alignbit_b32 v2, s12, v1, 16
-; SI-NEXT:    v_alignbit_b32 v1, s12, v1, 8
-; SI-NEXT:    v_alignbit_b32 v27, v28, s8, 24
-; SI-NEXT:    v_alignbit_b32 v26, v28, s8, 16
-; SI-NEXT:    v_alignbit_b32 v25, v28, s8, 8
-; SI-NEXT:    s_lshr_b32 s44, s12, 8
-; SI-NEXT:    s_lshr_b32 s14, s9, 8
-; SI-NEXT:    s_lshr_b32 s41, s10, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v29, 8, v28
-; SI-NEXT:    s_and_b32 s45, s19, 0xffff
-; SI-NEXT:    s_and_b32 s15, s23, 0xffff
-; SI-NEXT:    s_and_b32 s42, s27, 0xffff
-; SI-NEXT:    v_and_b32_e32 v30, 0xffff, v4
-; SI-NEXT:    s_bfe_u32 s13, s19, 0x80008
-; SI-NEXT:    s_bfe_u32 s40, s23, 0x80008
-; SI-NEXT:    s_bfe_u32 s43, s27, 0x80008
-; SI-NEXT:    v_bfe_u32 v31, v4, 8, 8
+; SI-NEXT:    s_or_b32 s4, s4, s5
+; SI-NEXT:    s_and_b32 s5, s18, 0xffff
+; SI-NEXT:    s_lshl_b32 s6, s19, 16
+; SI-NEXT:    s_or_b32 s5, s5, s6
+; SI-NEXT:    s_and_b32 s6, s20, 0xffff
+; SI-NEXT:    s_lshl_b32 s7, s21, 16
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    s_and_b32 s7, s22, 0xffff
+; SI-NEXT:    s_lshl_b32 s8, s23, 16
+; SI-NEXT:    s_or_b32 s7, s7, s8
+; SI-NEXT:    s_and_b32 s8, s24, 0xffff
+; SI-NEXT:    s_lshl_b32 s9, s25, 16
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    s_and_b32 s9, s26, 0xffff
+; SI-NEXT:    s_lshl_b32 s10, s27, 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[4:5], 24
+; SI-NEXT:    s_or_b32 s9, s9, s10
+; SI-NEXT:    s_and_b32 s10, s28, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s29, 16
+; SI-NEXT:    s_or_b32 s10, s10, s11
+; SI-NEXT:    s_and_b32 s11, s79, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s78, 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[4:5], 8
+; SI-NEXT:    s_lshr_b64 s[42:43], s[6:7], 24
+; SI-NEXT:    s_lshr_b64 s[44:45], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[6:7], 8
+; SI-NEXT:    s_lshr_b64 s[56:57], s[8:9], 24
+; SI-NEXT:    s_lshr_b64 s[58:59], s[8:9], 16
+; SI-NEXT:    s_or_b32 s11, s11, s13
+; SI-NEXT:    s_lshr_b32 s13, s5, 8
+; SI-NEXT:    s_lshr_b32 s41, s7, 8
+; SI-NEXT:    s_lshr_b32 s47, s9, 8
+; SI-NEXT:    s_lshr_b32 s88, s11, 8
+; SI-NEXT:    s_and_b32 s15, s19, 0xffff
+; SI-NEXT:    s_and_b32 s45, s23, 0xffff
+; SI-NEXT:    s_and_b32 s59, s27, 0xffff
+; SI-NEXT:    s_and_b32 s90, s78, 0xffff
+; SI-NEXT:    s_bfe_u32 s43, s19, 0x80008
+; SI-NEXT:    s_bfe_u32 s57, s23, 0x80008
+; SI-NEXT:    s_bfe_u32 s89, s27, 0x80008
+; SI-NEXT:    s_bfe_u32 s91, s78, 0x80008
+; SI-NEXT:    s_lshr_b64 s[60:61], s[8:9], 8
+; SI-NEXT:    s_lshr_b64 s[72:73], s[10:11], 24
+; SI-NEXT:    s_lshr_b64 s[62:63], s[10:11], 16
+; SI-NEXT:    s_lshr_b64 s[74:75], s[10:11], 8
 ; SI-NEXT:    s_cbranch_execnz .LBB97_3
 ; SI-NEXT:  .LBB97_2: ; %cmp.true
 ; SI-NEXT:    s_add_i32 s28, s28, 3
 ; SI-NEXT:    s_and_b32 s4, s28, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s29, 16
 ; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    s_add_i32 s79, s79, 3
+; SI-NEXT:    s_add_i32 s10, s4, 0x30000
+; SI-NEXT:    s_and_b32 s4, s79, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s78, 16
+; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s24, s24, 3
-; SI-NEXT:    s_add_i32 s8, s4, 0x30000
+; SI-NEXT:    s_add_i32 s11, s4, 0x30000
 ; SI-NEXT:    s_and_b32 s4, s24, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s25, 16
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s26, s26, 3
-; SI-NEXT:    s_add_i32 s7, s4, 0x30000
+; SI-NEXT:    s_add_i32 s8, s4, 0x30000
 ; SI-NEXT:    s_and_b32 s4, s26, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s27, 16
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s20, s20, 3
-; SI-NEXT:    s_add_i32 s10, s4, 0x30000
+; SI-NEXT:    s_add_i32 s9, s4, 0x30000
 ; SI-NEXT:    s_and_b32 s4, s20, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s21, 16
 ; SI-NEXT:    s_or_b32 s4, s5, s4
@@ -30529,99 +30580,103 @@ define inreg <32 x i8> @bitcast_v16i16_to_v32i8_scalar(<16 x i16> inreg %a, i32
 ; SI-NEXT:    s_lshl_b32 s5, s23, 16
 ; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s16, s16, 3
-; SI-NEXT:    s_add_i32 s9, s4, 0x30000
+; SI-NEXT:    s_add_i32 s7, s4, 0x30000
 ; SI-NEXT:    s_and_b32 s4, s16, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s17, 16
-; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
-; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s18, s18, 3
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    s_add_i32 s11, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s18, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s19, 16
-; SI-NEXT:    v_or_b32_e32 v0, v5, v0
 ; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    v_mov_b32_e32 v4, s6
-; SI-NEXT:    v_add_i32_e32 v28, vcc, 0x30000, v0
-; SI-NEXT:    v_mov_b32_e32 v0, s8
-; SI-NEXT:    s_add_i32 s12, s4, 0x30000
-; SI-NEXT:    v_mov_b32_e32 v1, s11
-; SI-NEXT:    v_alignbit_b32 v11, s9, v4, 24
-; SI-NEXT:    v_alignbit_b32 v10, s9, v4, 16
-; SI-NEXT:    v_alignbit_b32 v9, s9, v4, 8
-; SI-NEXT:    v_mov_b32_e32 v4, s7
-; SI-NEXT:    v_alignbit_b32 v3, s12, v1, 24
-; SI-NEXT:    v_alignbit_b32 v2, s12, v1, 16
-; SI-NEXT:    v_alignbit_b32 v1, s12, v1, 8
-; SI-NEXT:    v_alignbit_b32 v19, s10, v4, 24
-; SI-NEXT:    v_alignbit_b32 v18, s10, v4, 16
-; SI-NEXT:    v_alignbit_b32 v17, s10, v4, 8
-; SI-NEXT:    v_alignbit_b32 v27, v28, v0, 24
-; SI-NEXT:    v_alignbit_b32 v26, v28, v0, 16
-; SI-NEXT:    v_alignbit_b32 v25, v28, v0, 8
-; SI-NEXT:    s_lshr_b32 s13, s12, 24
-; SI-NEXT:    s_lshr_b32 s45, s12, 16
-; SI-NEXT:    s_lshr_b32 s44, s12, 8
-; SI-NEXT:    s_lshr_b32 s40, s9, 24
-; SI-NEXT:    s_lshr_b32 s15, s9, 16
-; SI-NEXT:    s_lshr_b32 s14, s9, 8
-; SI-NEXT:    s_lshr_b32 s43, s10, 24
-; SI-NEXT:    s_lshr_b32 s42, s10, 16
-; SI-NEXT:    s_lshr_b32 s41, s10, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 24, v28
-; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v28
-; SI-NEXT:    v_lshrrev_b32_e32 v29, 8, v28
+; SI-NEXT:    s_and_b32 s5, s18, 0xffff
+; SI-NEXT:    s_lshl_b32 s12, s19, 16
+; SI-NEXT:    s_or_b32 s5, s12, s5
+; SI-NEXT:    s_add_i32 s4, s4, 0x30000
+; SI-NEXT:    s_add_i32 s5, s5, 0x30000
+; SI-NEXT:    s_lshr_b64 s[12:13], s[4:5], 24
+; SI-NEXT:    s_lshr_b64 s[14:15], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[4:5], 8
+; SI-NEXT:    s_lshr_b64 s[42:43], s[6:7], 24
+; SI-NEXT:    s_lshr_b64 s[44:45], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[6:7], 8
+; SI-NEXT:    s_lshr_b64 s[56:57], s[8:9], 24
+; SI-NEXT:    s_lshr_b64 s[58:59], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[8:9], 8
+; SI-NEXT:    s_lshr_b64 s[72:73], s[10:11], 24
+; SI-NEXT:    s_lshr_b64 s[62:63], s[10:11], 16
+; SI-NEXT:    s_lshr_b64 s[74:75], s[10:11], 8
+; SI-NEXT:    s_lshr_b32 s43, s5, 24
+; SI-NEXT:    s_lshr_b32 s15, s5, 16
+; SI-NEXT:    s_lshr_b32 s13, s5, 8
+; SI-NEXT:    s_lshr_b32 s57, s7, 24
+; SI-NEXT:    s_lshr_b32 s45, s7, 16
+; SI-NEXT:    s_lshr_b32 s41, s7, 8
+; SI-NEXT:    s_lshr_b32 s89, s9, 24
+; SI-NEXT:    s_lshr_b32 s59, s9, 16
+; SI-NEXT:    s_lshr_b32 s47, s9, 8
+; SI-NEXT:    s_lshr_b32 s91, s11, 24
+; SI-NEXT:    s_lshr_b32 s90, s11, 16
+; SI-NEXT:    s_lshr_b32 s88, s11, 8
 ; SI-NEXT:  .LBB97_3: ; %end
-; SI-NEXT:    v_mov_b32_e32 v0, s11
-; SI-NEXT:    v_mov_b32_e32 v4, s12
-; SI-NEXT:    v_mov_b32_e32 v5, s44
-; SI-NEXT:    v_mov_b32_e32 v6, s45
-; SI-NEXT:    v_mov_b32_e32 v7, s13
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s40
+; SI-NEXT:    v_mov_b32_e32 v2, s14
+; SI-NEXT:    v_mov_b32_e32 v3, s12
+; SI-NEXT:    v_mov_b32_e32 v4, s5
+; SI-NEXT:    v_mov_b32_e32 v5, s13
+; SI-NEXT:    v_mov_b32_e32 v6, s15
+; SI-NEXT:    v_mov_b32_e32 v7, s43
 ; SI-NEXT:    v_mov_b32_e32 v8, s6
-; SI-NEXT:    v_mov_b32_e32 v12, s9
-; SI-NEXT:    v_mov_b32_e32 v13, s14
-; SI-NEXT:    v_mov_b32_e32 v14, s15
-; SI-NEXT:    v_mov_b32_e32 v15, s40
-; SI-NEXT:    v_mov_b32_e32 v16, s7
-; SI-NEXT:    v_mov_b32_e32 v20, s10
-; SI-NEXT:    v_mov_b32_e32 v21, s41
-; SI-NEXT:    v_mov_b32_e32 v22, s42
-; SI-NEXT:    v_mov_b32_e32 v23, s43
-; SI-NEXT:    v_mov_b32_e32 v24, s8
+; SI-NEXT:    v_mov_b32_e32 v9, s46
+; SI-NEXT:    v_mov_b32_e32 v10, s44
+; SI-NEXT:    v_mov_b32_e32 v11, s42
+; SI-NEXT:    v_mov_b32_e32 v12, s7
+; SI-NEXT:    v_mov_b32_e32 v13, s41
+; SI-NEXT:    v_mov_b32_e32 v14, s45
+; SI-NEXT:    v_mov_b32_e32 v15, s57
+; SI-NEXT:    v_mov_b32_e32 v16, s8
+; SI-NEXT:    v_mov_b32_e32 v17, s60
+; SI-NEXT:    v_mov_b32_e32 v18, s58
+; SI-NEXT:    v_mov_b32_e32 v19, s56
+; SI-NEXT:    v_mov_b32_e32 v20, s9
+; SI-NEXT:    v_mov_b32_e32 v21, s47
+; SI-NEXT:    v_mov_b32_e32 v22, s59
+; SI-NEXT:    v_mov_b32_e32 v23, s89
+; SI-NEXT:    v_mov_b32_e32 v24, s10
+; SI-NEXT:    v_mov_b32_e32 v25, s74
+; SI-NEXT:    v_mov_b32_e32 v26, s62
+; SI-NEXT:    v_mov_b32_e32 v27, s72
+; SI-NEXT:    v_mov_b32_e32 v28, s11
+; SI-NEXT:    v_mov_b32_e32 v29, s88
+; SI-NEXT:    v_mov_b32_e32 v30, s90
+; SI-NEXT:    v_mov_b32_e32 v31, s91
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB97_4:
-; SI-NEXT:    ; implicit-def: $sgpr11
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr40
+; SI-NEXT:    ; implicit-def: $sgpr14
 ; SI-NEXT:    ; implicit-def: $sgpr12
-; SI-NEXT:    ; implicit-def: $sgpr44
-; SI-NEXT:    ; implicit-def: $sgpr45
 ; SI-NEXT:    ; implicit-def: $sgpr13
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; implicit-def: $sgpr9
-; SI-NEXT:    ; implicit-def: $sgpr14
 ; SI-NEXT:    ; implicit-def: $sgpr15
-; SI-NEXT:    ; implicit-def: $sgpr40
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr18
-; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    ; implicit-def: $sgpr10
-; SI-NEXT:    ; implicit-def: $sgpr41
-; SI-NEXT:    ; implicit-def: $sgpr42
 ; SI-NEXT:    ; implicit-def: $sgpr43
+; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr44
+; SI-NEXT:    ; implicit-def: $sgpr42
+; SI-NEXT:    ; implicit-def: $sgpr41
+; SI-NEXT:    ; implicit-def: $sgpr45
+; SI-NEXT:    ; implicit-def: $sgpr57
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr26
-; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr29
-; SI-NEXT:    ; implicit-def: $vgpr30
-; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; implicit-def: $sgpr60
+; SI-NEXT:    ; implicit-def: $sgpr58
+; SI-NEXT:    ; implicit-def: $sgpr56
+; SI-NEXT:    ; implicit-def: $sgpr47
+; SI-NEXT:    ; implicit-def: $sgpr59
+; SI-NEXT:    ; implicit-def: $sgpr89
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr74
+; SI-NEXT:    ; implicit-def: $sgpr62
+; SI-NEXT:    ; implicit-def: $sgpr88
+; SI-NEXT:    ; implicit-def: $sgpr90
+; SI-NEXT:    ; implicit-def: $sgpr91
+; SI-NEXT:    ; implicit-def: $sgpr72
 ; SI-NEXT:    s_branch .LBB97_2
 ;
 ; VI-LABEL: bitcast_v16i16_to_v32i8_scalar:
@@ -32019,234 +32074,229 @@ define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v18
-; SI-NEXT:    v_mov_b32_e32 v19, v14
-; SI-NEXT:    v_mov_b32_e32 v20, v12
-; SI-NEXT:    v_readfirstlane_b32 s13, v11
-; SI-NEXT:    v_readfirstlane_b32 s14, v10
-; SI-NEXT:    v_readfirstlane_b32 s9, v3
-; SI-NEXT:    v_readfirstlane_b32 s10, v2
-; SI-NEXT:    v_readfirstlane_b32 s7, v1
-; SI-NEXT:    v_readfirstlane_b32 s6, v0
+; SI-NEXT:    v_mov_b32_e32 v22, v14
+; SI-NEXT:    v_mov_b32_e32 v21, v10
+; SI-NEXT:    v_readfirstlane_b32 s43, v1
+; SI-NEXT:    v_readfirstlane_b32 s42, v0
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v9
+; SI-NEXT:    v_lshlrev_b32_e32 v23, 8, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 8, v15
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v17
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 24, v13
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 8, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v24, 24, v13
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v9
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v15
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 24, v17
 ; SI-NEXT:    s_cbranch_scc0 .LBB99_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    s_and_b32 s4, s20, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s21, 8
+; SI-NEXT:    s_and_b32 s4, s16, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s17, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s22, 0xff
+; SI-NEXT:    s_and_b32 s5, s18, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s8, s23, 24
+; SI-NEXT:    s_lshl_b32 s6, s19, 24
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s8, s5
-; SI-NEXT:    s_or_b32 s11, s4, s5
-; SI-NEXT:    s_and_b32 s4, s18, 0xff
-; SI-NEXT:    s_lshl_b32 s4, s4, 16
-; SI-NEXT:    s_lshl_b32 s8, s19, 24
-; SI-NEXT:    s_or_b32 s4, s8, s4
-; SI-NEXT:    s_and_b32 s8, s28, 0xff
-; SI-NEXT:    s_lshl_b32 s12, s29, 8
-; SI-NEXT:    s_or_b32 s8, s8, s12
-; SI-NEXT:    s_and_b32 s12, s6, 0xff
-; SI-NEXT:    s_lshl_b32 s12, s12, 16
-; SI-NEXT:    s_lshl_b32 s15, s7, 24
-; SI-NEXT:    s_or_b32 s41, s15, s12
-; SI-NEXT:    s_and_b32 s12, s26, 0xff
-; SI-NEXT:    s_lshl_b32 s12, s12, 16
-; SI-NEXT:    s_lshl_b32 s15, s27, 24
-; SI-NEXT:    s_or_b32 s12, s15, s12
-; SI-NEXT:    s_and_b32 s15, s16, 0xff
-; SI-NEXT:    s_lshl_b32 s40, s17, 8
-; SI-NEXT:    v_and_b32_e32 v9, 0xff, v6
-; SI-NEXT:    v_and_b32_e32 v10, 0xff, v8
-; SI-NEXT:    s_or_b32 s15, s15, s40
-; SI-NEXT:    v_or_b32_e32 v9, v9, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT:    s_and_b32 s15, s15, 0xffff
-; SI-NEXT:    v_mov_b32_e32 v1, s4
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; SI-NEXT:    v_or_b32_e32 v11, v0, v10
-; SI-NEXT:    s_or_b32 s15, s15, s4
+; SI-NEXT:    s_or_b32 s12, s6, s5
+; SI-NEXT:    s_or_b32 s6, s4, s12
 ; SI-NEXT:    s_and_b32 s4, s24, 0xff
-; SI-NEXT:    s_lshl_b32 s40, s25, 8
-; SI-NEXT:    v_or_b32_e32 v10, v9, v11
+; SI-NEXT:    s_lshl_b32 s5, s25, 8
+; SI-NEXT:    s_or_b32 s4, s4, s5
+; SI-NEXT:    s_and_b32 s5, s26, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    s_lshl_b32 s7, s27, 24
+; SI-NEXT:    s_or_b32 s14, s7, s5
+; SI-NEXT:    s_and_b32 s5, s20, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s21, 8
+; SI-NEXT:    s_or_b32 s5, s5, s7
+; SI-NEXT:    s_and_b32 s7, s22, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    s_lshl_b32 s8, s23, 24
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_or_b32 s7, s8, s7
+; SI-NEXT:    s_or_b32 s13, s5, s7
+; SI-NEXT:    s_lshr_b64 s[8:9], s[12:13], 16
+; SI-NEXT:    v_and_b32_e32 v10, 0xff, v12
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v8
+; SI-NEXT:    s_and_b32 s5, s28, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s29, 8
+; SI-NEXT:    v_and_b32_e32 v9, 0xff, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    v_and_b32_e32 v17, 0xff, v16
+; SI-NEXT:    s_or_b32 s5, s5, s9
+; SI-NEXT:    s_and_b32 s9, s42, 0xff
+; SI-NEXT:    v_or_b32_e32 v9, v9, v23
+; SI-NEXT:    v_or_b32_e32 v13, v24, v10
+; SI-NEXT:    v_and_b32_e32 v10, 0xff, v6
+; SI-NEXT:    v_or_b32_e32 v15, v0, v14
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v22
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_lshl_b32 s10, s43, 24
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v9
 ; SI-NEXT:    v_and_b32_e32 v9, 0xff, v4
-; SI-NEXT:    s_or_b32 s4, s4, s40
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_mov_b32_e32 v5, s12
-; SI-NEXT:    v_or_b32_e32 v12, v3, v9
-; SI-NEXT:    v_and_b32_e32 v9, 0xff, v19
-; SI-NEXT:    v_and_b32_e32 v13, 0xff, v16
-; SI-NEXT:    s_or_b32 s12, s4, s12
-; SI-NEXT:    s_and_b32 s4, s10, 0xff
-; SI-NEXT:    s_lshl_b32 s40, s9, 8
-; SI-NEXT:    v_or_b32_e32 v9, v9, v21
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT:    s_or_b32 s4, s4, s40
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; SI-NEXT:    v_or_b32_e32 v15, v7, v13
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_or_b32_e32 v14, v9, v15
-; SI-NEXT:    v_and_b32_e32 v9, 0xff, v20
-; SI-NEXT:    v_or_b32_e32 v18, s4, v12
-; SI-NEXT:    s_and_b32 s4, s14, 0xff
-; SI-NEXT:    s_lshl_b32 s40, s13, 8
-; SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; SI-NEXT:    v_or_b32_e32 v10, v10, v1
+; SI-NEXT:    v_or_b32_e32 v14, v14, v7
+; SI-NEXT:    v_or_b32_e32 v26, v5, v17
+; SI-NEXT:    v_and_b32_e32 v17, 0xff, v21
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_or_b32 s12, s10, s9
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT:    s_or_b32 s4, s4, s40
-; SI-NEXT:    s_or_b32 s8, s8, s41
-; SI-NEXT:    v_or_b32_e32 v22, v17, v9
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; SI-NEXT:    v_or_b32_e32 v17, v17, v25
+; SI-NEXT:    s_or_b32 s15, s5, s12
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_alignbit_b32 v1, s11, v1, 16
-; SI-NEXT:    v_alignbit_b32 v5, s8, v5, 16
-; SI-NEXT:    v_alignbit_b32 v9, v10, v12, 16
-; SI-NEXT:    v_alignbit_b32 v13, v14, v22, 16
-; SI-NEXT:    v_or_b32_e32 v12, s4, v22
-; SI-NEXT:    s_lshr_b32 s40, s5, 16
-; SI-NEXT:    s_lshr_b32 s41, s41, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; SI-NEXT:    v_or_b32_e32 v9, v3, v9
+; SI-NEXT:    v_or_b32_e32 v10, v10, v15
+; SI-NEXT:    v_or_b32_e32 v14, v14, v26
+; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; SI-NEXT:    s_lshr_b64 s[10:11], s[14:15], 16
+; SI-NEXT:    s_or_b32 s4, s4, s14
+; SI-NEXT:    v_or_b32_e32 v19, v11, v9
+; SI-NEXT:    v_mov_b32_e32 v20, v10
+; SI-NEXT:    v_lshr_b64 v[9:10], v[9:10], 16
+; SI-NEXT:    v_or_b32_e32 v17, v17, v13
+; SI-NEXT:    v_mov_b32_e32 v18, v14
+; SI-NEXT:    v_lshr_b64 v[13:14], v[13:14], 16
+; SI-NEXT:    s_lshr_b32 s9, s7, 16
+; SI-NEXT:    s_lshr_b32 s11, s12, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v15
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v26
+; SI-NEXT:    s_mov_b32 s7, s13
+; SI-NEXT:    s_mov_b32 s5, s15
 ; SI-NEXT:    s_cbranch_execnz .LBB99_3
 ; SI-NEXT:  .LBB99_2: ; %cmp.true
-; SI-NEXT:    s_add_i32 s14, s14, 3
-; SI-NEXT:    s_and_b32 s4, s14, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s13, 8
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v20
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_or_b32_e32 v1, v17, v1
-; SI-NEXT:    v_or_b32_e32 v1, s4, v1
-; SI-NEXT:    v_add_i32_e32 v12, vcc, 0x3000000, v1
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v19
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v16
-; SI-NEXT:    v_or_b32_e32 v1, v21, v1
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x300, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    v_or_b32_e32 v5, v7, v5
-; SI-NEXT:    v_or_b32_e32 v1, v5, v1
-; SI-NEXT:    s_add_i32 s10, s10, 3
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 0x3000000, v1
-; SI-NEXT:    s_and_b32 s4, s10, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s9, 8
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v4
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_or_b32_e32 v1, v3, v1
 ; SI-NEXT:    s_add_i32 s24, s24, 3
-; SI-NEXT:    v_or_b32_e32 v1, s4, v1
 ; SI-NEXT:    s_and_b32 s4, s24, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s25, 8
 ; SI-NEXT:    s_add_i32 s26, s26, 3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_and_b32 s8, s26, 0xff
+; SI-NEXT:    s_and_b32 s6, s26, 0xff
 ; SI-NEXT:    s_addk_i32 s4, 0x300
 ; SI-NEXT:    s_lshl_b32 s5, s27, 24
-; SI-NEXT:    s_lshl_b32 s8, s8, 16
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s5, s8
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s28, s28, 3
-; SI-NEXT:    s_add_i32 s12, s4, 0x3000000
-; SI-NEXT:    s_and_b32 s4, s28, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s29, 8
-; SI-NEXT:    s_add_i32 s6, s6, 3
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_and_b32 s6, s6, 0xff
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s7, 24
 ; SI-NEXT:    s_lshl_b32 s6, s6, 16
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s5, s5, s6
+; SI-NEXT:    s_add_i32 s28, s28, 3
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 3, v21
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v2
 ; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    s_and_b32 s5, s28, 0xff
+; SI-NEXT:    s_lshl_b32 s6, s29, 8
+; SI-NEXT:    s_add_i32 s42, s42, 3
+; SI-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; SI-NEXT:    v_add_i32_e32 v10, vcc, 3, v12
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 3, v4
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    s_and_b32 s7, s42, 0xff
+; SI-NEXT:    v_or_b32_e32 v9, v25, v9
+; SI-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; SI-NEXT:    v_or_b32_e32 v2, v23, v2
+; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; SI-NEXT:    s_addk_i32 s5, 0x300
+; SI-NEXT:    s_lshl_b32 s6, s43, 24
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 0x300, v9
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x300, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_or_b32 s6, s6, s7
 ; SI-NEXT:    s_add_i32 s16, s16, 3
-; SI-NEXT:    s_add_i32 s8, s4, 0x3000000
-; SI-NEXT:    s_and_b32 s4, s16, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s17, 8
+; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; SI-NEXT:    v_or_b32_e32 v10, v24, v10
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    v_or_b32_e32 v3, v3, v4
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    s_and_b32 s6, s16, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s17, 8
 ; SI-NEXT:    s_add_i32 s18, s18, 3
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_and_b32 s6, s18, 0xff
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s19, 24
-; SI-NEXT:    s_lshl_b32 s6, s6, 16
-; SI-NEXT:    v_add_i32_e32 v18, vcc, 0x3000000, v1
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v6
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s5, s6
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    v_or_b32_e32 v9, v10, v9
+; SI-NEXT:    v_or_b32_e32 v2, v3, v2
+; SI-NEXT:    s_or_b32 s6, s7, s6
+; SI-NEXT:    s_and_b32 s8, s18, 0xff
+; SI-NEXT:    v_add_i32_e32 v17, vcc, 0x3000000, v9
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 3, v22
+; SI-NEXT:    v_add_i32_e32 v19, vcc, 0x3000000, v2
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v6
+; SI-NEXT:    s_addk_i32 s6, 0x300
+; SI-NEXT:    s_lshl_b32 s7, s19, 24
+; SI-NEXT:    s_lshl_b32 s8, s8, 16
+; SI-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_or_b32 s7, s7, s8
 ; SI-NEXT:    s_add_i32 s20, s20, 3
-; SI-NEXT:    v_or_b32_e32 v1, v2, v1
+; SI-NEXT:    v_or_b32_e32 v7, v7, v9
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 3, v16
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v8
-; SI-NEXT:    s_add_i32 s15, s4, 0x3000000
-; SI-NEXT:    s_and_b32 s4, s20, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s21, 8
+; SI-NEXT:    s_or_b32 s6, s7, s6
+; SI-NEXT:    s_and_b32 s7, s20, 0xff
+; SI-NEXT:    s_lshl_b32 s8, s21, 8
 ; SI-NEXT:    s_add_i32 s22, s22, 3
+; SI-NEXT:    v_and_b32_e32 v9, 0xff, v9
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_and_b32 s6, s22, 0xff
+; SI-NEXT:    s_or_b32 s7, s8, s7
+; SI-NEXT:    s_and_b32 s9, s22, 0xff
+; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x300, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x300, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s23, 24
-; SI-NEXT:    s_lshl_b32 s6, s6, 16
+; SI-NEXT:    s_addk_i32 s7, 0x300
+; SI-NEXT:    s_lshl_b32 s8, s23, 24
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; SI-NEXT:    v_or_b32_e32 v5, v5, v9
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v2
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s5, s6
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    v_or_b32_e32 v5, v5, v7
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 0x3000000, v0
-; SI-NEXT:    s_add_i32 s11, s4, 0x3000000
-; SI-NEXT:    v_mov_b32_e32 v0, s15
-; SI-NEXT:    v_alignbit_b32 v1, s11, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s12
-; SI-NEXT:    v_alignbit_b32 v5, s8, v0, 16
-; SI-NEXT:    v_alignbit_b32 v9, v10, v18, 16
-; SI-NEXT:    v_alignbit_b32 v13, v14, v12, 16
-; SI-NEXT:    s_lshr_b32 s40, s11, 16
-; SI-NEXT:    s_lshr_b32 s41, s8, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
+; SI-NEXT:    s_or_b32 s7, s8, s7
+; SI-NEXT:    v_add_i32_e32 v18, vcc, 0x3000000, v5
+; SI-NEXT:    v_add_i32_e32 v20, vcc, 0x3000000, v0
+; SI-NEXT:    s_add_i32 s4, s4, 0x3000000
+; SI-NEXT:    s_add_i32 s5, s5, 0x3000000
+; SI-NEXT:    s_add_i32 s6, s6, 0x3000000
+; SI-NEXT:    s_add_i32 s7, s7, 0x3000000
+; SI-NEXT:    s_lshr_b64 s[8:9], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[4:5], 16
+; SI-NEXT:    v_lshr_b64 v[9:10], v[19:20], 16
+; SI-NEXT:    v_lshr_b64 v[13:14], v[17:18], 16
+; SI-NEXT:    s_lshr_b32 s9, s7, 16
+; SI-NEXT:    s_lshr_b32 s11, s5, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v18
 ; SI-NEXT:  .LBB99_3: ; %end
-; SI-NEXT:    v_mov_b32_e32 v0, s15
-; SI-NEXT:    v_mov_b32_e32 v2, s11
-; SI-NEXT:    v_mov_b32_e32 v3, s40
-; SI-NEXT:    v_mov_b32_e32 v4, s12
-; SI-NEXT:    v_mov_b32_e32 v6, s8
-; SI-NEXT:    v_mov_b32_e32 v7, s41
-; SI-NEXT:    v_mov_b32_e32 v8, v18
+; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s8
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    v_mov_b32_e32 v3, s9
+; SI-NEXT:    v_mov_b32_e32 v4, s4
+; SI-NEXT:    v_mov_b32_e32 v5, s10
+; SI-NEXT:    v_mov_b32_e32 v6, s5
+; SI-NEXT:    v_mov_b32_e32 v7, s11
+; SI-NEXT:    v_mov_b32_e32 v8, v19
+; SI-NEXT:    v_mov_b32_e32 v10, v20
+; SI-NEXT:    v_mov_b32_e32 v12, v17
+; SI-NEXT:    v_mov_b32_e32 v14, v18
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB99_4:
-; SI-NEXT:    ; implicit-def: $sgpr15
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr11
-; SI-NEXT:    ; implicit-def: $sgpr40
-; SI-NEXT:    ; implicit-def: $sgpr12
-; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $sgpr6
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $sgpr41
-; SI-NEXT:    ; implicit-def: $vgpr18
+; SI-NEXT:    ; implicit-def: $sgpr9
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr11
+; SI-NEXT:    ; implicit-def: $vgpr19
 ; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr10
 ; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; implicit-def: $vgpr12
+; SI-NEXT:    ; implicit-def: $vgpr17
 ; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $vgpr14
 ; SI-NEXT:    ; implicit-def: $vgpr15
 ; SI-NEXT:    s_branch .LBB99_2
 ;
@@ -35384,116 +35434,121 @@ define inreg <32 x i8> @bitcast_v16f16_to_v32i8_scalar(<16 x half> inreg %a, i32
 ; SI-LABEL: bitcast_v16f16_to_v32i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v34, s17
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, s16
+; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f16_f32_e32 v32, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v16, s16
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, s19
-; SI-NEXT:    v_cvt_f16_f32_e32 v32, s18
-; SI-NEXT:    v_cvt_f16_f32_e32 v37, s21
-; SI-NEXT:    v_cvt_f16_f32_e32 v36, s20
+; SI-NEXT:    v_cvt_f16_f32_e32 v8, s18
+; SI-NEXT:    v_cvt_f16_f32_e32 v53, s21
+; SI-NEXT:    v_cvt_f16_f32_e32 v52, s20
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, s23
-; SI-NEXT:    v_cvt_f16_f32_e32 v35, s22
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, s25
-; SI-NEXT:    v_cvt_f16_f32_e32 v39, s24
+; SI-NEXT:    v_cvt_f16_f32_e32 v39, s22
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    v_cvt_f16_f32_e32 v40, s25
+; SI-NEXT:    v_cvt_f16_f32_e32 v55, s24
 ; SI-NEXT:    v_cvt_f16_f32_e32 v22, s27
-; SI-NEXT:    v_cvt_f16_f32_e32 v38, s26
-; SI-NEXT:    v_cvt_f16_f32_e32 v51, s29
-; SI-NEXT:    v_cvt_f16_f32_e32 v50, s28
+; SI-NEXT:    v_cvt_f16_f32_e32 v54, s26
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v42, s29
+; SI-NEXT:    v_cvt_f16_f32_e32 v41, s28
 ; SI-NEXT:    v_cvt_f16_f32_e32 v30, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v49, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_cbranch_scc0 .LBB105_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v37
-; SI-NEXT:    v_or_b32_e32 v8, v36, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v14
-; SI-NEXT:    v_or_b32_e32 v12, v35, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v48
-; SI-NEXT:    v_or_b32_e32 v16, v39, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v22
-; SI-NEXT:    v_or_b32_e32 v20, v38, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v51
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v34
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v32
+; SI-NEXT:    v_or_b32_e32 v48, v16, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
-; SI-NEXT:    v_or_b32_e32 v24, v50, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v30
-; SI-NEXT:    v_or_b32_e32 v0, v33, v0
-; SI-NEXT:    v_or_b32_e32 v4, v32, v1
-; SI-NEXT:    v_or_b32_e32 v28, v49, v5
-; SI-NEXT:    v_alignbit_b32 v3, v4, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, v4, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, v4, v0, 8
-; SI-NEXT:    v_alignbit_b32 v11, v12, v8, 24
-; SI-NEXT:    v_alignbit_b32 v10, v12, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, v12, v8, 8
-; SI-NEXT:    v_alignbit_b32 v19, v20, v16, 24
-; SI-NEXT:    v_alignbit_b32 v18, v20, v16, 16
-; SI-NEXT:    v_alignbit_b32 v17, v20, v16, 8
-; SI-NEXT:    v_alignbit_b32 v27, v28, v24, 24
-; SI-NEXT:    v_alignbit_b32 v26, v28, v24, 16
-; SI-NEXT:    v_alignbit_b32 v25, v28, v24, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v13, 8, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v21, 8, v20
-; SI-NEXT:    v_lshrrev_b32_e32 v29, 8, v28
+; SI-NEXT:    v_or_b32_e32 v49, v8, v1
+; SI-NEXT:    v_lshr_b64 v[1:2], v[48:49], 8
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v53
+; SI-NEXT:    v_or_b32_e32 v35, v52, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v14
+; SI-NEXT:    v_or_b32_e32 v36, v39, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v40
+; SI-NEXT:    v_or_b32_e32 v37, v55, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v22
+; SI-NEXT:    v_or_b32_e32 v38, v54, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v42
+; SI-NEXT:    v_lshr_b64 v[3:4], v[48:49], 24
+; SI-NEXT:    v_lshr_b64 v[11:12], v[35:36], 24
+; SI-NEXT:    v_or_b32_e32 v33, v41, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v30
+; SI-NEXT:    v_lshr_b64 v[4:5], v[48:49], 16
+; SI-NEXT:    v_lshr_b64 v[12:13], v[35:36], 16
+; SI-NEXT:    v_or_b32_e32 v34, v0, v2
+; SI-NEXT:    v_lshr_b64 v[24:25], v[37:38], 16
+; SI-NEXT:    v_lshr_b64 v[9:10], v[35:36], 8
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v49
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 8, v36
+; SI-NEXT:    v_lshrrev_b32_e32 v21, 8, v38
+; SI-NEXT:    v_lshrrev_b32_e32 v29, 8, v34
 ; SI-NEXT:    v_bfe_u32 v7, v6, 8, 8
 ; SI-NEXT:    v_bfe_u32 v15, v14, 8, 8
 ; SI-NEXT:    v_bfe_u32 v23, v22, 8, 8
 ; SI-NEXT:    v_bfe_u32 v31, v30, 8, 8
+; SI-NEXT:    v_lshr_b64 v[19:20], v[37:38], 24
+; SI-NEXT:    v_lshr_b64 v[17:18], v[37:38], 8
+; SI-NEXT:    v_lshr_b64 v[27:28], v[33:34], 24
+; SI-NEXT:    v_lshr_b64 v[50:51], v[33:34], 16
+; SI-NEXT:    v_lshr_b64 v[25:26], v[33:34], 8
 ; SI-NEXT:    s_cbranch_execnz .LBB105_3
 ; SI-NEXT:  .LBB105_2: ; %cmp.true
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, v51
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v50
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v30
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v39
-; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v42
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v41
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v30
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v30, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v49
-; SI-NEXT:    v_or_b32_e32 v24, v1, v0
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v48
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v30, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_or_b32_e32 v33, v2, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v40
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v55
+; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v30
-; SI-NEXT:    v_or_b32_e32 v28, v2, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v30
+; SI-NEXT:    v_or_b32_e32 v34, v0, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v22
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v16, v3, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v38
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v37
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; SI-NEXT:    v_or_b32_e32 v37, v3, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v54
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v53
 ; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v22, v0
 ; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v1
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v36
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v52
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v22
-; SI-NEXT:    v_or_b32_e32 v20, v0, v2
+; SI-NEXT:    v_or_b32_e32 v38, v0, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v14
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v35
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v39
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, v1
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    v_or_b32_e32 v8, v2, v0
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, v34
+; SI-NEXT:    v_or_b32_e32 v35, v2, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v32
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v14
-; SI-NEXT:    v_or_b32_e32 v12, v1, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v33
+; SI-NEXT:    v_or_b32_e32 v36, v1, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v32
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v8
 ; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
@@ -35503,60 +35558,72 @@ define inreg <32 x i8> @bitcast_v16f16_to_v32i8_scalar(<16 x half> inreg %a, i32
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
-; SI-NEXT:    v_or_b32_e32 v4, v2, v1
-; SI-NEXT:    v_alignbit_b32 v3, v4, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, v4, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, v4, v0, 8
-; SI-NEXT:    v_alignbit_b32 v11, v12, v8, 24
-; SI-NEXT:    v_alignbit_b32 v10, v12, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, v12, v8, 8
-; SI-NEXT:    v_alignbit_b32 v19, v20, v16, 24
-; SI-NEXT:    v_alignbit_b32 v18, v20, v16, 16
-; SI-NEXT:    v_alignbit_b32 v17, v20, v16, 8
-; SI-NEXT:    v_alignbit_b32 v27, v28, v24, 24
-; SI-NEXT:    v_alignbit_b32 v26, v28, v24, 16
-; SI-NEXT:    v_alignbit_b32 v25, v28, v24, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v13, 8, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v21, 8, v20
-; SI-NEXT:    v_lshrrev_b32_e32 v29, 8, v28
+; SI-NEXT:    v_or_b32_e32 v48, v1, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
+; SI-NEXT:    v_or_b32_e32 v49, v2, v0
+; SI-NEXT:    v_lshr_b64 v[3:4], v[48:49], 24
+; SI-NEXT:    v_lshr_b64 v[11:12], v[35:36], 24
+; SI-NEXT:    v_lshr_b64 v[24:25], v[37:38], 16
+; SI-NEXT:    v_lshr_b64 v[4:5], v[48:49], 16
+; SI-NEXT:    v_lshr_b64 v[1:2], v[48:49], 8
+; SI-NEXT:    v_lshr_b64 v[12:13], v[35:36], 16
+; SI-NEXT:    v_lshr_b64 v[9:10], v[35:36], 8
+; SI-NEXT:    v_lshr_b64 v[19:20], v[37:38], 24
+; SI-NEXT:    v_lshr_b64 v[17:18], v[37:38], 8
+; SI-NEXT:    v_lshr_b64 v[27:28], v[33:34], 24
+; SI-NEXT:    v_lshr_b64 v[50:51], v[33:34], 16
+; SI-NEXT:    v_lshr_b64 v[25:26], v[33:34], 8
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v49
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 8, v36
+; SI-NEXT:    v_lshrrev_b32_e32 v21, 8, v38
+; SI-NEXT:    v_lshrrev_b32_e32 v29, 8, v34
 ; SI-NEXT:    v_bfe_u32 v7, v6, 8, 8
 ; SI-NEXT:    v_bfe_u32 v15, v14, 8, 8
 ; SI-NEXT:    v_bfe_u32 v23, v22, 8, 8
 ; SI-NEXT:    v_bfe_u32 v31, v30, 8, 8
 ; SI-NEXT:  .LBB105_3: ; %end
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v0, v48
+; SI-NEXT:    v_mov_b32_e32 v2, v4
+; SI-NEXT:    v_mov_b32_e32 v4, v49
+; SI-NEXT:    v_mov_b32_e32 v8, v35
+; SI-NEXT:    v_mov_b32_e32 v10, v12
+; SI-NEXT:    v_mov_b32_e32 v12, v36
+; SI-NEXT:    v_mov_b32_e32 v16, v37
+; SI-NEXT:    v_mov_b32_e32 v18, v24
+; SI-NEXT:    v_mov_b32_e32 v20, v38
+; SI-NEXT:    v_mov_b32_e32 v24, v33
+; SI-NEXT:    v_mov_b32_e32 v26, v50
+; SI-NEXT:    v_mov_b32_e32 v28, v34
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB105_4:
-; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr48
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr4
+; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr5
 ; SI-NEXT:    ; implicit-def: $vgpr7
-; SI-NEXT:    ; implicit-def: $vgpr8
+; SI-NEXT:    ; implicit-def: $vgpr35
 ; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; implicit-def: $vgpr12
 ; SI-NEXT:    ; implicit-def: $vgpr13
 ; SI-NEXT:    ; implicit-def: $vgpr15
-; SI-NEXT:    ; implicit-def: $vgpr16
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr18
-; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    ; implicit-def: $vgpr20
 ; SI-NEXT:    ; implicit-def: $vgpr21
 ; SI-NEXT:    ; implicit-def: $vgpr23
+; SI-NEXT:    ; implicit-def: $vgpr29
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr12
+; SI-NEXT:    ; implicit-def: $vgpr11
+; SI-NEXT:    ; implicit-def: $vgpr37
+; SI-NEXT:    ; implicit-def: $vgpr17
 ; SI-NEXT:    ; implicit-def: $vgpr24
+; SI-NEXT:    ; implicit-def: $vgpr19
+; SI-NEXT:    ; implicit-def: $vgpr33
 ; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr26
+; SI-NEXT:    ; implicit-def: $vgpr50
 ; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr29
-; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    s_branch .LBB105_2
 ;
 ; VI-LABEL: bitcast_v16f16_to_v32i8_scalar:
@@ -38793,166 +38860,186 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a,
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_mul_f32_e64 v34, 1.0, s17
-; SI-NEXT:    v_mul_f32_e64 v35, 1.0, s16
-; SI-NEXT:    v_mul_f32_e64 v32, 1.0, s19
-; SI-NEXT:    v_mul_f32_e64 v33, 1.0, s18
-; SI-NEXT:    v_mul_f32_e64 v38, 1.0, s21
-; SI-NEXT:    v_mul_f32_e64 v39, 1.0, s20
-; SI-NEXT:    v_mul_f32_e64 v36, 1.0, s23
-; SI-NEXT:    v_mul_f32_e64 v37, 1.0, s22
-; SI-NEXT:    v_mul_f32_e64 v50, 1.0, s25
-; SI-NEXT:    v_mul_f32_e64 v51, 1.0, s24
-; SI-NEXT:    v_mul_f32_e64 v48, 1.0, s27
-; SI-NEXT:    v_mul_f32_e64 v49, 1.0, s26
-; SI-NEXT:    v_mul_f32_e64 v54, 1.0, s29
-; SI-NEXT:    v_mul_f32_e64 v55, 1.0, s28
-; SI-NEXT:    v_mul_f32_e32 v52, 1.0, v1
-; SI-NEXT:    v_mul_f32_e32 v53, 1.0, v0
+; SI-NEXT:    v_mul_f32_e64 v24, 1.0, s17
+; SI-NEXT:    v_mul_f32_e64 v32, 1.0, s16
+; SI-NEXT:    v_mul_f32_e64 v8, 1.0, s19
+; SI-NEXT:    v_mul_f32_e64 v16, 1.0, s18
+; SI-NEXT:    v_mul_f32_e64 v51, 1.0, s21
+; SI-NEXT:    v_mul_f32_e64 v52, 1.0, s20
+; SI-NEXT:    v_mul_f32_e64 v39, 1.0, s23
+; SI-NEXT:    v_mul_f32_e64 v50, 1.0, s22
+; SI-NEXT:    v_mul_f32_e64 v55, 1.0, s25
+; SI-NEXT:    s_waitcnt expcnt(3)
+; SI-NEXT:    v_mul_f32_e64 v40, 1.0, s24
+; SI-NEXT:    v_mul_f32_e64 v53, 1.0, s27
+; SI-NEXT:    v_mul_f32_e64 v54, 1.0, s26
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mul_f32_e64 v42, 1.0, s29
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mul_f32_e64 v43, 1.0, s28
+; SI-NEXT:    v_mul_f32_e32 v41, 1.0, v1
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    s_cbranch_scc0 .LBB109_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v38
-; SI-NEXT:    v_alignbit_b32 v8, v5, v39, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v50
-; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v34
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v32
-; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v36
-; SI-NEXT:    v_alignbit_b32 v16, v5, v51, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v48
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v54
-; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v52
-; SI-NEXT:    v_alignbit_b32 v0, v0, v35, 16
-; SI-NEXT:    v_alignbit_b32 v4, v6, v33, 16
-; SI-NEXT:    v_alignbit_b32 v12, v14, v37, 16
-; SI-NEXT:    v_alignbit_b32 v20, v22, v49, 16
-; SI-NEXT:    v_alignbit_b32 v24, v5, v55, 16
-; SI-NEXT:    v_alignbit_b32 v28, v30, v53, 16
-; SI-NEXT:    v_alignbit_b32 v3, v4, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, v4, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, v4, v0, 8
-; SI-NEXT:    v_alignbit_b32 v11, v12, v8, 24
-; SI-NEXT:    v_alignbit_b32 v10, v12, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, v12, v8, 8
-; SI-NEXT:    v_alignbit_b32 v19, v20, v16, 24
-; SI-NEXT:    v_alignbit_b32 v18, v20, v16, 16
-; SI-NEXT:    v_alignbit_b32 v17, v20, v16, 8
-; SI-NEXT:    v_alignbit_b32 v27, v28, v24, 24
-; SI-NEXT:    v_alignbit_b32 v26, v28, v24, 16
-; SI-NEXT:    v_alignbit_b32 v25, v28, v24, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 24, v32
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 24, v36
-; SI-NEXT:    v_lshrrev_b32_e32 v13, 8, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v23, 24, v48
-; SI-NEXT:    v_lshrrev_b32_e32 v21, 8, v20
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 24, v52
-; SI-NEXT:    v_lshrrev_b32_e32 v29, 8, v28
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v24
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v8
+; SI-NEXT:    v_alignbit_b32 v48, v1, v32, 16
+; SI-NEXT:    v_alignbit_b32 v49, v6, v16, 16
+; SI-NEXT:    v_lshr_b64 v[1:2], v[48:49], 8
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v51
+; SI-NEXT:    v_alignbit_b32 v37, v2, v52, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v55
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v39
+; SI-NEXT:    v_alignbit_b32 v35, v2, v40, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v53
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v42
+; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v41
+; SI-NEXT:    v_alignbit_b32 v38, v14, v50, 16
+; SI-NEXT:    v_alignbit_b32 v36, v22, v54, 16
+; SI-NEXT:    v_alignbit_b32 v33, v2, v43, 16
+; SI-NEXT:    v_alignbit_b32 v34, v30, v0, 16
+; SI-NEXT:    v_lshr_b64 v[3:4], v[48:49], 24
+; SI-NEXT:    v_lshr_b64 v[11:12], v[37:38], 24
+; SI-NEXT:    v_lshr_b64 v[19:20], v[35:36], 24
+; SI-NEXT:    v_lshr_b64 v[27:28], v[33:34], 24
+; SI-NEXT:    v_lshr_b64 v[4:5], v[48:49], 16
+; SI-NEXT:    v_lshr_b64 v[12:13], v[37:38], 16
+; SI-NEXT:    v_lshr_b64 v[20:21], v[35:36], 16
+; SI-NEXT:    v_lshr_b64 v[28:29], v[33:34], 16
+; SI-NEXT:    v_lshr_b64 v[9:10], v[37:38], 8
+; SI-NEXT:    v_lshr_b64 v[17:18], v[35:36], 8
+; SI-NEXT:    v_lshr_b64 v[25:26], v[33:34], 8
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 24, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v49
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 24, v39
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 8, v38
+; SI-NEXT:    v_lshrrev_b32_e32 v23, 24, v53
+; SI-NEXT:    v_lshrrev_b32_e32 v21, 8, v36
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 24, v41
+; SI-NEXT:    v_lshrrev_b32_e32 v29, 8, v34
 ; SI-NEXT:    s_cbranch_execnz .LBB109_3
 ; SI-NEXT:  .LBB109_2: ; %cmp.true
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v54
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v55
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v42
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v43
+; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_alignbit_b32 v24, v1, v0, 16
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v52
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v53
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_alignbit_b32 v33, v2, v1, 16
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v41
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; SI-NEXT:    v_add_f32_e32 v31, 0x40c00000, v1
 ; SI-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
 ; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v31
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v50
-; SI-NEXT:    v_alignbit_b32 v28, v30, v0, 16
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v51
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v55
+; SI-NEXT:    v_alignbit_b32 v34, v30, v0, 16
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v40
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_alignbit_b32 v16, v1, v0, 16
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v48
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v49
+; SI-NEXT:    v_alignbit_b32 v35, v1, v0, 16
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v53
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v54
 ; SI-NEXT:    v_add_f32_e32 v23, 0x40c00000, v1
 ; SI-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
 ; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v23
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v38
-; SI-NEXT:    v_alignbit_b32 v20, v22, v0, 16
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v39
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v51
+; SI-NEXT:    v_alignbit_b32 v36, v22, v0, 16
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v52
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_alignbit_b32 v8, v1, v0, 16
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v36
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v37
+; SI-NEXT:    v_alignbit_b32 v37, v1, v0, 16
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v39
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v50
 ; SI-NEXT:    v_add_f32_e32 v15, 0x40c00000, v1
 ; SI-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
 ; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v15
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v34
-; SI-NEXT:    v_alignbit_b32 v12, v14, v0, 16
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v35
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v24
+; SI-NEXT:    v_alignbit_b32 v38, v14, v0, 16
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v32
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v32
-; SI-NEXT:    v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v33
-; SI-NEXT:    v_add_f32_e32 v7, 0x40c00000, v2
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_alignbit_b32 v48, v1, v0, 16
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v8
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v16
+; SI-NEXT:    v_add_f32_e32 v7, 0x40c00000, v1
+; SI-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
 ; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v7
-; SI-NEXT:    v_alignbit_b32 v4, v6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v3, v4, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, v4, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, v4, v0, 8
-; SI-NEXT:    v_alignbit_b32 v11, v12, v8, 24
-; SI-NEXT:    v_alignbit_b32 v10, v12, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, v12, v8, 8
-; SI-NEXT:    v_alignbit_b32 v19, v20, v16, 24
-; SI-NEXT:    v_alignbit_b32 v18, v20, v16, 16
-; SI-NEXT:    v_alignbit_b32 v17, v20, v16, 8
-; SI-NEXT:    v_alignbit_b32 v27, v28, v24, 24
-; SI-NEXT:    v_alignbit_b32 v26, v28, v24, 16
-; SI-NEXT:    v_alignbit_b32 v25, v28, v24, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v13, 8, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v21, 8, v20
-; SI-NEXT:    v_lshrrev_b32_e32 v29, 8, v28
+; SI-NEXT:    v_alignbit_b32 v49, v6, v0, 16
+; SI-NEXT:    v_lshr_b64 v[3:4], v[48:49], 24
+; SI-NEXT:    v_lshr_b64 v[11:12], v[37:38], 24
+; SI-NEXT:    v_lshr_b64 v[19:20], v[35:36], 24
+; SI-NEXT:    v_lshr_b64 v[27:28], v[33:34], 24
+; SI-NEXT:    v_lshr_b64 v[4:5], v[48:49], 16
+; SI-NEXT:    v_lshr_b64 v[1:2], v[48:49], 8
+; SI-NEXT:    v_lshr_b64 v[12:13], v[37:38], 16
+; SI-NEXT:    v_lshr_b64 v[9:10], v[37:38], 8
+; SI-NEXT:    v_lshr_b64 v[20:21], v[35:36], 16
+; SI-NEXT:    v_lshr_b64 v[17:18], v[35:36], 8
+; SI-NEXT:    v_lshr_b64 v[28:29], v[33:34], 16
+; SI-NEXT:    v_lshr_b64 v[25:26], v[33:34], 8
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v49
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 8, v38
+; SI-NEXT:    v_lshrrev_b32_e32 v21, 8, v36
+; SI-NEXT:    v_lshrrev_b32_e32 v29, 8, v34
 ; SI-NEXT:    v_lshrrev_b32_e32 v7, 24, v7
 ; SI-NEXT:    v_lshrrev_b32_e32 v15, 24, v15
 ; SI-NEXT:    v_lshrrev_b32_e32 v23, 24, v23
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 24, v31
 ; SI-NEXT:  .LBB109_3: ; %end
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v0, v48
+; SI-NEXT:    v_mov_b32_e32 v2, v4
+; SI-NEXT:    v_mov_b32_e32 v4, v49
+; SI-NEXT:    v_mov_b32_e32 v8, v37
+; SI-NEXT:    v_mov_b32_e32 v10, v12
+; SI-NEXT:    v_mov_b32_e32 v12, v38
+; SI-NEXT:    v_mov_b32_e32 v16, v35
+; SI-NEXT:    v_mov_b32_e32 v18, v20
+; SI-NEXT:    v_mov_b32_e32 v20, v36
+; SI-NEXT:    v_mov_b32_e32 v24, v33
+; SI-NEXT:    v_mov_b32_e32 v26, v28
+; SI-NEXT:    v_mov_b32_e32 v28, v34
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB109_4:
-; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr48
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr4
+; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr5
 ; SI-NEXT:    ; implicit-def: $vgpr6
 ; SI-NEXT:    ; implicit-def: $vgpr7
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; implicit-def: $vgpr12
 ; SI-NEXT:    ; implicit-def: $vgpr13
 ; SI-NEXT:    ; implicit-def: $vgpr14
 ; SI-NEXT:    ; implicit-def: $vgpr15
-; SI-NEXT:    ; implicit-def: $vgpr16
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr18
-; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    ; implicit-def: $vgpr20
 ; SI-NEXT:    ; implicit-def: $vgpr21
 ; SI-NEXT:    ; implicit-def: $vgpr22
 ; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; implicit-def: $vgpr24
-; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr26
-; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $vgpr28
 ; SI-NEXT:    ; implicit-def: $vgpr29
 ; SI-NEXT:    ; implicit-def: $vgpr30
 ; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr37
+; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $vgpr12
+; SI-NEXT:    ; implicit-def: $vgpr11
+; SI-NEXT:    ; implicit-def: $vgpr35
+; SI-NEXT:    ; implicit-def: $vgpr17
+; SI-NEXT:    ; implicit-def: $vgpr20
+; SI-NEXT:    ; implicit-def: $vgpr19
+; SI-NEXT:    ; implicit-def: $vgpr33
+; SI-NEXT:    ; implicit-def: $vgpr25
+; SI-NEXT:    ; implicit-def: $vgpr28
+; SI-NEXT:    ; implicit-def: $vgpr27
 ; SI-NEXT:    s_branch .LBB109_2
 ;
 ; VI-LABEL: bitcast_v16bf16_to_v32i8_scalar:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll
index 6cf53d187fcab..57de868ad37b3 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll
@@ -647,70 +647,65 @@ define inreg <18 x i16> @bitcast_v9i32_to_v18i16_scalar(<9 x i32> inreg %a, i32
 ; SI-NEXT:    s_cmp_lg_u32 s25, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB5_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s24
-; SI-NEXT:    v_alignbit_b32 v17, s4, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    v_alignbit_b32 v13, s23, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v9, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s23, 16
-; SI-NEXT:    s_lshr_b32 s7, s21, 16
-; SI-NEXT:    s_lshr_b32 s8, s19, 16
-; SI-NEXT:    s_lshr_b32 s9, s17, 16
+; SI-NEXT:    s_lshr_b32 s25, s23, 16
+; SI-NEXT:    s_lshr_b32 s26, s21, 16
+; SI-NEXT:    s_lshr_b32 s27, s19, 16
+; SI-NEXT:    s_lshr_b32 s28, s17, 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB5_3
 ; SI-NEXT:  .LBB5_2: ; %cmp.true
-; SI-NEXT:    s_add_i32 s22, s22, 3
-; SI-NEXT:    s_add_i32 s20, s20, 3
 ; SI-NEXT:    s_add_i32 s23, s23, 3
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    s_add_i32 s18, s18, 3
-; SI-NEXT:    s_add_i32 s21, s21, 3
-; SI-NEXT:    v_alignbit_b32 v13, s23, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    s_add_i32 s16, s16, 3
-; SI-NEXT:    s_add_i32 s19, s19, 3
-; SI-NEXT:    v_alignbit_b32 v9, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
 ; SI-NEXT:    s_add_i32 s24, s24, 3
 ; SI-NEXT:    s_add_i32 s17, s17, 3
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s24
-; SI-NEXT:    v_alignbit_b32 v17, s4, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s23, 16
-; SI-NEXT:    s_lshr_b32 s7, s21, 16
-; SI-NEXT:    s_lshr_b32 s8, s19, 16
-; SI-NEXT:    s_lshr_b32 s9, s17, 16
+; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    s_add_i32 s19, s19, 3
+; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_add_i32 s21, s21, 3
+; SI-NEXT:    s_add_i32 s20, s20, 3
+; SI-NEXT:    s_add_i32 s22, s22, 3
+; SI-NEXT:    s_lshr_b32 s25, s23, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[16:17], 16
+; SI-NEXT:    s_lshr_b32 s26, s21, 16
+; SI-NEXT:    s_lshr_b32 s27, s19, 16
+; SI-NEXT:    s_lshr_b32 s28, s17, 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[24:25], 16
 ; SI-NEXT:  .LBB5_3: ; %end
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
+; SI-NEXT:    v_mov_b32_e32 v1, s12
 ; SI-NEXT:    v_mov_b32_e32 v2, s17
-; SI-NEXT:    v_mov_b32_e32 v3, s9
+; SI-NEXT:    v_mov_b32_e32 v3, s28
 ; SI-NEXT:    v_mov_b32_e32 v4, s18
+; SI-NEXT:    v_mov_b32_e32 v5, s10
 ; SI-NEXT:    v_mov_b32_e32 v6, s19
-; SI-NEXT:    v_mov_b32_e32 v7, s8
+; SI-NEXT:    v_mov_b32_e32 v7, s27
 ; SI-NEXT:    v_mov_b32_e32 v8, s20
+; SI-NEXT:    v_mov_b32_e32 v9, s6
 ; SI-NEXT:    v_mov_b32_e32 v10, s21
-; SI-NEXT:    v_mov_b32_e32 v11, s7
+; SI-NEXT:    v_mov_b32_e32 v11, s26
 ; SI-NEXT:    v_mov_b32_e32 v12, s22
+; SI-NEXT:    v_mov_b32_e32 v13, s4
 ; SI-NEXT:    v_mov_b32_e32 v14, s23
-; SI-NEXT:    v_mov_b32_e32 v15, s6
+; SI-NEXT:    v_mov_b32_e32 v15, s25
 ; SI-NEXT:    v_mov_b32_e32 v16, s24
+; SI-NEXT:    v_mov_b32_e32 v17, s8
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB5_4:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr9
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $vgpr13
+; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr28
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr27
 ; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr17
+; SI-NEXT:    ; implicit-def: $sgpr26
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr25
+; SI-NEXT:    ; implicit-def: $sgpr8
 ; SI-NEXT:    s_branch .LBB5_2
 ;
 ; VI-LABEL: bitcast_v9i32_to_v18i16_scalar:
@@ -2361,66 +2356,75 @@ define inreg <18 x i16> @bitcast_v9f32_to_v18i16_scalar(<9 x float> inreg %a, i3
 ; SI-NEXT:    s_cmp_lg_u32 s25, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB13_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s24
-; SI-NEXT:    v_alignbit_b32 v17, s4, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    v_alignbit_b32 v13, s23, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v9, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s9, s23, 16
-; SI-NEXT:    s_lshr_b32 s8, s21, 16
-; SI-NEXT:    s_lshr_b32 s7, s19, 16
-; SI-NEXT:    s_lshr_b32 s6, s17, 16
+; SI-NEXT:    s_lshr_b32 s25, s17, 16
+; SI-NEXT:    s_lshr_b32 s28, s23, 16
+; SI-NEXT:    s_lshr_b32 s27, s21, 16
+; SI-NEXT:    s_lshr_b32 s26, s19, 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB13_4
 ; SI-NEXT:  .LBB13_2: ; %cmp.true
 ; SI-NEXT:    v_add_f32_e64 v16, s24, 1.0
-; SI-NEXT:    v_add_f32_e64 v2, s17, 1.0
-; SI-NEXT:    v_add_f32_e64 v0, s16, 1.0
-; SI-NEXT:    v_add_f32_e64 v6, s19, 1.0
-; SI-NEXT:    v_add_f32_e64 v4, s18, 1.0
-; SI-NEXT:    v_add_f32_e64 v10, s21, 1.0
-; SI-NEXT:    v_add_f32_e64 v8, s20, 1.0
-; SI-NEXT:    v_add_f32_e64 v14, s23, 1.0
-; SI-NEXT:    v_add_f32_e64 v12, s22, 1.0
-; SI-NEXT:    v_alignbit_b32 v13, v14, v12, 16
-; SI-NEXT:    v_alignbit_b32 v9, v10, v8, 16
-; SI-NEXT:    v_alignbit_b32 v5, v6, v4, 16
-; SI-NEXT:    v_alignbit_b32 v1, v2, v0, 16
-; SI-NEXT:    v_alignbit_b32 v17, s4, v16, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-NEXT:    s_setpc_b64 s[30:31]
+; SI-NEXT:    v_add_f32_e64 v26, s17, 1.0
+; SI-NEXT:    v_add_f32_e64 v25, s16, 1.0
+; SI-NEXT:    v_add_f32_e64 v24, s19, 1.0
+; SI-NEXT:    v_add_f32_e64 v23, s18, 1.0
+; SI-NEXT:    v_add_f32_e64 v22, s21, 1.0
+; SI-NEXT:    v_add_f32_e64 v21, s20, 1.0
+; SI-NEXT:    v_add_f32_e64 v20, s23, 1.0
+; SI-NEXT:    v_add_f32_e64 v19, s22, 1.0
+; SI-NEXT:    v_lshr_b64 v[13:14], v[19:20], 16
+; SI-NEXT:    v_lshr_b64 v[9:10], v[21:22], 16
+; SI-NEXT:    v_lshr_b64 v[5:6], v[23:24], 16
+; SI-NEXT:    v_lshr_b64 v[1:2], v[25:26], 16
+; SI-NEXT:    v_lshr_b64 v[17:18], v[16:17], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v24
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v26
+; SI-NEXT:    s_branch .LBB13_5
 ; SI-NEXT:  .LBB13_3:
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr25
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr26
 ; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $sgpr27
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr28
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $sgpr9
-; SI-NEXT:    ; implicit-def: $vgpr17
 ; SI-NEXT:    s_branch .LBB13_2
 ; SI-NEXT:  .LBB13_4:
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_mov_b32_e32 v2, s17
-; SI-NEXT:    v_mov_b32_e32 v4, s18
-; SI-NEXT:    v_mov_b32_e32 v6, s19
-; SI-NEXT:    v_mov_b32_e32 v8, s20
-; SI-NEXT:    v_mov_b32_e32 v10, s21
-; SI-NEXT:    v_mov_b32_e32 v12, s22
-; SI-NEXT:    v_mov_b32_e32 v14, s23
+; SI-NEXT:    v_mov_b32_e32 v25, s16
+; SI-NEXT:    v_mov_b32_e32 v26, s17
+; SI-NEXT:    v_mov_b32_e32 v23, s18
+; SI-NEXT:    v_mov_b32_e32 v24, s19
+; SI-NEXT:    v_mov_b32_e32 v21, s20
+; SI-NEXT:    v_mov_b32_e32 v22, s21
+; SI-NEXT:    v_mov_b32_e32 v19, s22
+; SI-NEXT:    v_mov_b32_e32 v20, s23
 ; SI-NEXT:    v_mov_b32_e32 v16, s24
-; SI-NEXT:    v_mov_b32_e32 v3, s6
-; SI-NEXT:    v_mov_b32_e32 v7, s7
-; SI-NEXT:    v_mov_b32_e32 v11, s8
-; SI-NEXT:    v_mov_b32_e32 v15, s9
+; SI-NEXT:    v_mov_b32_e32 v3, s25
+; SI-NEXT:    v_mov_b32_e32 v7, s26
+; SI-NEXT:    v_mov_b32_e32 v11, s27
+; SI-NEXT:    v_mov_b32_e32 v15, s28
+; SI-NEXT:    v_mov_b32_e32 v17, s8
+; SI-NEXT:    v_mov_b32_e32 v1, s12
+; SI-NEXT:    v_mov_b32_e32 v5, s10
+; SI-NEXT:    v_mov_b32_e32 v9, s6
+; SI-NEXT:    v_mov_b32_e32 v13, s4
+; SI-NEXT:  .LBB13_5: ; %end
+; SI-NEXT:    v_mov_b32_e32 v0, v25
+; SI-NEXT:    v_mov_b32_e32 v2, v26
+; SI-NEXT:    v_mov_b32_e32 v4, v23
+; SI-NEXT:    v_mov_b32_e32 v6, v24
+; SI-NEXT:    v_mov_b32_e32 v8, v21
+; SI-NEXT:    v_mov_b32_e32 v10, v22
+; SI-NEXT:    v_mov_b32_e32 v12, v19
+; SI-NEXT:    v_mov_b32_e32 v14, v20
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bitcast_v9f32_to_v18i16_scalar:
@@ -4659,112 +4663,114 @@ define inreg <18 x i16> @bitcast_v18f16_to_v18i16_scalar(<18 x half> inreg %a, i
 ; SI-LABEL: bitcast_v18f16_to_v18i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v17, v3
-; SI-NEXT:    v_mov_b32_e32 v16, v2
-; SI-NEXT:    v_mov_b32_e32 v15, v1
-; SI-NEXT:    v_mov_b32_e32 v14, v0
-; SI-NEXT:    v_mov_b32_e32 v18, v4
+; SI-NEXT:    v_mov_b32_e32 v5, v3
+; SI-NEXT:    v_mov_b32_e32 v9, v2
+; SI-NEXT:    v_mov_b32_e32 v10, v0
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, s16
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v23, s17
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, s18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, s19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, s20
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, s21
+; SI-NEXT:    v_cvt_f16_f32_e32 v18, s21
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, s22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, s23
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, s24
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, s25
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, s26
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, s27
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, s28
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, s29
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v18
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v10
+; SI-NEXT:    v_cvt_f16_f32_e32 v15, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v16, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v17, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v21, s25
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, s26
+; SI-NEXT:    v_cvt_f16_f32_e32 v19, s29
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_cbranch_scc0 .LBB23_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_cbranch_execnz .LBB23_3
 ; SI-NEXT:  .LBB23_2: ; %cmp.true
-; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
-; SI-NEXT:    v_cvt_f32_f16_e32 v16, v16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v15, v15
 ; SI-NEXT:    v_cvt_f32_f16_e32 v14, v14
 ; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
 ; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT:    v_add_f32_e32 v17, 0x38000000, v17
+; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
-; SI-NEXT:    v_add_f32_e32 v16, 0x38000000, v16
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v18
 ; SI-NEXT:    v_add_f32_e32 v15, 0x38000000, v15
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
+; SI-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v23
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v21
+; SI-NEXT:    v_cvt_f32_f16_e32 v13, v19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
 ; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
 ; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
 ; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
 ; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; SI-NEXT:    v_add_f32_e32 v17, 0x38000000, v17
 ; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; SI-NEXT:    v_add_f32_e32 v16, 0x38000000, v16
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
 ; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v15
+; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
 ; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
-; SI-NEXT:    v_or_b32_e32 v16, v16, v18
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v15
+; SI-NEXT:    v_or_b32_e32 v14, v14, v18
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v11
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
+; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
+; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
+; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
+; SI-NEXT:    v_or_b32_e32 v10, v10, v18
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v7
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; SI-NEXT:    v_or_b32_e32 v14, v14, v18
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v11
-; SI-NEXT:    v_or_b32_e32 v10, v10, v18
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v7
 ; SI-NEXT:    v_or_b32_e32 v6, v6, v18
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v17
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_or_b32_e32 v16, v16, v18
 ; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v18
+; SI-NEXT:    v_lshr_b64 v[18:19], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[23:24], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[21:22], v[9:10], 16
+; SI-NEXT:    v_lshr_b64 v[19:20], v[13:14], 16
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SI-NEXT:    v_or_b32_e32 v4, v4, v5
 ; SI-NEXT:    v_or_b32_e32 v8, v8, v9
 ; SI-NEXT:    v_or_b32_e32 v12, v12, v13
-; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v9, v10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v13, v14, v13, 16
 ; SI-NEXT:  .LBB23_3: ; %end
+; SI-NEXT:    v_mov_b32_e32 v1, v23
+; SI-NEXT:    v_mov_b32_e32 v5, v18
+; SI-NEXT:    v_mov_b32_e32 v9, v21
+; SI-NEXT:    v_mov_b32_e32 v13, v19
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB23_4:
 ; SI-NEXT:    s_branch .LBB23_2
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
index 2abb2f3b9de52..3aaf25423a184 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
@@ -681,76 +681,71 @@ define inreg <20 x i16> @bitcast_v10i32_to_v20i16_scalar(<10 x i32> inreg %a, i3
 ; SI-NEXT:    s_cmp_lg_u32 s26, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB5_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s24
-; SI-NEXT:    v_alignbit_b32 v17, s25, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    v_alignbit_b32 v13, s23, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v9, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s25, 16
-; SI-NEXT:    s_lshr_b32 s7, s23, 16
-; SI-NEXT:    s_lshr_b32 s8, s21, 16
-; SI-NEXT:    s_lshr_b32 s9, s19, 16
-; SI-NEXT:    s_lshr_b32 s10, s17, 16
+; SI-NEXT:    s_lshr_b32 s26, s25, 16
+; SI-NEXT:    s_lshr_b32 s27, s23, 16
+; SI-NEXT:    s_lshr_b32 s28, s21, 16
+; SI-NEXT:    s_lshr_b32 s29, s19, 16
+; SI-NEXT:    s_lshr_b32 s40, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB5_3
 ; SI-NEXT:  .LBB5_2: ; %cmp.true
-; SI-NEXT:    s_add_i32 s24, s24, 3
-; SI-NEXT:    s_add_i32 s22, s22, 3
-; SI-NEXT:    s_add_i32 s25, s25, 3
-; SI-NEXT:    v_mov_b32_e32 v0, s24
-; SI-NEXT:    s_add_i32 s20, s20, 3
-; SI-NEXT:    s_add_i32 s23, s23, 3
-; SI-NEXT:    v_alignbit_b32 v17, s25, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    s_add_i32 s18, s18, 3
-; SI-NEXT:    s_add_i32 s21, s21, 3
-; SI-NEXT:    v_alignbit_b32 v13, s23, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s20
+; SI-NEXT:    s_add_i32 s17, s17, 3
 ; SI-NEXT:    s_add_i32 s16, s16, 3
 ; SI-NEXT:    s_add_i32 s19, s19, 3
-; SI-NEXT:    v_alignbit_b32 v9, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    s_add_i32 s17, s17, 3
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s25, 16
-; SI-NEXT:    s_lshr_b32 s7, s23, 16
-; SI-NEXT:    s_lshr_b32 s8, s21, 16
-; SI-NEXT:    s_lshr_b32 s9, s19, 16
-; SI-NEXT:    s_lshr_b32 s10, s17, 16
+; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_add_i32 s21, s21, 3
+; SI-NEXT:    s_add_i32 s20, s20, 3
+; SI-NEXT:    s_add_i32 s23, s23, 3
+; SI-NEXT:    s_add_i32 s22, s22, 3
+; SI-NEXT:    s_add_i32 s25, s25, 3
+; SI-NEXT:    s_add_i32 s24, s24, 3
+; SI-NEXT:    s_lshr_b64 s[4:5], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[16:17], 16
+; SI-NEXT:    s_lshr_b32 s26, s25, 16
+; SI-NEXT:    s_lshr_b32 s27, s23, 16
+; SI-NEXT:    s_lshr_b32 s28, s21, 16
+; SI-NEXT:    s_lshr_b32 s29, s19, 16
+; SI-NEXT:    s_lshr_b32 s40, s17, 16
 ; SI-NEXT:  .LBB5_3: ; %end
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
+; SI-NEXT:    v_mov_b32_e32 v1, s12
 ; SI-NEXT:    v_mov_b32_e32 v2, s17
-; SI-NEXT:    v_mov_b32_e32 v3, s10
+; SI-NEXT:    v_mov_b32_e32 v3, s40
 ; SI-NEXT:    v_mov_b32_e32 v4, s18
+; SI-NEXT:    v_mov_b32_e32 v5, s10
 ; SI-NEXT:    v_mov_b32_e32 v6, s19
-; SI-NEXT:    v_mov_b32_e32 v7, s9
+; SI-NEXT:    v_mov_b32_e32 v7, s29
 ; SI-NEXT:    v_mov_b32_e32 v8, s20
+; SI-NEXT:    v_mov_b32_e32 v9, s8
 ; SI-NEXT:    v_mov_b32_e32 v10, s21
-; SI-NEXT:    v_mov_b32_e32 v11, s8
+; SI-NEXT:    v_mov_b32_e32 v11, s28
 ; SI-NEXT:    v_mov_b32_e32 v12, s22
+; SI-NEXT:    v_mov_b32_e32 v13, s6
 ; SI-NEXT:    v_mov_b32_e32 v14, s23
-; SI-NEXT:    v_mov_b32_e32 v15, s7
+; SI-NEXT:    v_mov_b32_e32 v15, s27
 ; SI-NEXT:    v_mov_b32_e32 v16, s24
+; SI-NEXT:    v_mov_b32_e32 v17, s4
 ; SI-NEXT:    v_mov_b32_e32 v18, s25
-; SI-NEXT:    v_mov_b32_e32 v19, s6
+; SI-NEXT:    v_mov_b32_e32 v19, s26
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB5_4:
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr40
 ; SI-NEXT:    ; implicit-def: $sgpr10
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $sgpr9
-; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $sgpr29
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $vgpr17
+; SI-NEXT:    ; implicit-def: $sgpr28
 ; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr27
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr26
 ; SI-NEXT:    s_branch .LBB5_2
 ;
 ; VI-LABEL: bitcast_v10i32_to_v20i16_scalar:
@@ -3371,241 +3366,239 @@ define inreg <40 x i8> @bitcast_v10i32_to_v40i8_scalar(<10 x i32> inreg %a, i32
 ; SI-NEXT:    s_cmp_lg_u32 s26, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB13_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v3, s24
-; SI-NEXT:    v_mov_b32_e32 v6, s22
-; SI-NEXT:    v_mov_b32_e32 v9, s20
-; SI-NEXT:    v_mov_b32_e32 v12, s18
-; SI-NEXT:    v_mov_b32_e32 v15, s16
-; SI-NEXT:    v_alignbit_b32 v1, s25, v3, 24
-; SI-NEXT:    v_alignbit_b32 v2, s25, v3, 16
-; SI-NEXT:    v_alignbit_b32 v3, s25, v3, 8
-; SI-NEXT:    v_alignbit_b32 v4, s23, v6, 24
-; SI-NEXT:    v_alignbit_b32 v5, s23, v6, 16
-; SI-NEXT:    v_alignbit_b32 v6, s23, v6, 8
-; SI-NEXT:    v_alignbit_b32 v7, s21, v9, 24
-; SI-NEXT:    v_alignbit_b32 v8, s21, v9, 16
-; SI-NEXT:    v_alignbit_b32 v9, s21, v9, 8
-; SI-NEXT:    v_alignbit_b32 v10, s19, v12, 24
-; SI-NEXT:    v_alignbit_b32 v11, s19, v12, 16
-; SI-NEXT:    v_alignbit_b32 v12, s19, v12, 8
-; SI-NEXT:    v_alignbit_b32 v13, s17, v15, 24
-; SI-NEXT:    v_alignbit_b32 v14, s17, v15, 16
-; SI-NEXT:    v_alignbit_b32 v15, s17, v15, 8
-; SI-NEXT:    s_lshr_b32 s6, s25, 24
-; SI-NEXT:    s_lshr_b32 s7, s25, 16
-; SI-NEXT:    s_lshr_b32 s8, s25, 8
-; SI-NEXT:    s_lshr_b32 s9, s23, 24
-; SI-NEXT:    s_lshr_b32 s10, s23, 16
-; SI-NEXT:    s_lshr_b32 s11, s23, 8
-; SI-NEXT:    s_lshr_b32 s12, s21, 24
-; SI-NEXT:    s_lshr_b32 s13, s21, 16
-; SI-NEXT:    s_lshr_b32 s14, s21, 8
-; SI-NEXT:    s_lshr_b32 s15, s19, 24
-; SI-NEXT:    s_lshr_b32 s26, s19, 16
-; SI-NEXT:    s_lshr_b32 s27, s19, 8
-; SI-NEXT:    s_lshr_b32 s28, s17, 24
-; SI-NEXT:    s_lshr_b32 s29, s17, 16
-; SI-NEXT:    s_lshr_b32 s40, s17, 8
+; SI-NEXT:    s_lshr_b32 s72, s25, 24
+; SI-NEXT:    s_lshr_b32 s73, s25, 16
+; SI-NEXT:    s_lshr_b32 s74, s25, 8
+; SI-NEXT:    s_lshr_b32 s75, s23, 24
+; SI-NEXT:    s_lshr_b32 s76, s23, 16
+; SI-NEXT:    s_lshr_b32 s77, s23, 8
+; SI-NEXT:    s_lshr_b32 s78, s21, 24
+; SI-NEXT:    s_lshr_b32 s79, s21, 16
+; SI-NEXT:    s_lshr_b32 s88, s21, 8
+; SI-NEXT:    s_lshr_b32 s89, s19, 24
+; SI-NEXT:    s_lshr_b32 s90, s19, 16
+; SI-NEXT:    s_lshr_b32 s91, s19, 8
+; SI-NEXT:    s_lshr_b32 s92, s17, 24
+; SI-NEXT:    s_lshr_b32 s93, s17, 16
+; SI-NEXT:    s_lshr_b32 s94, s17, 8
+; SI-NEXT:    s_lshr_b64 s[4:5], s[24:25], 24
+; SI-NEXT:    s_lshr_b64 s[6:7], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[24:25], 8
+; SI-NEXT:    s_lshr_b64 s[10:11], s[22:23], 24
+; SI-NEXT:    s_lshr_b64 s[12:13], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[22:23], 8
+; SI-NEXT:    s_lshr_b64 s[26:27], s[20:21], 24
+; SI-NEXT:    s_lshr_b64 s[28:29], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[20:21], 8
+; SI-NEXT:    s_lshr_b64 s[42:43], s[18:19], 24
+; SI-NEXT:    s_lshr_b64 s[44:45], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[18:19], 8
+; SI-NEXT:    s_lshr_b64 s[56:57], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[58:59], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[16:17], 8
 ; SI-NEXT:    s_cbranch_execnz .LBB13_3
 ; SI-NEXT:  .LBB13_2: ; %cmp.true
-; SI-NEXT:    s_add_i32 s16, s16, 3
-; SI-NEXT:    s_add_i32 s18, s18, 3
-; SI-NEXT:    s_add_i32 s20, s20, 3
-; SI-NEXT:    s_add_i32 s22, s22, 3
-; SI-NEXT:    s_add_i32 s24, s24, 3
 ; SI-NEXT:    s_add_i32 s17, s17, 3
+; SI-NEXT:    s_add_i32 s16, s16, 3
 ; SI-NEXT:    s_add_i32 s19, s19, 3
+; SI-NEXT:    s_add_i32 s18, s18, 3
 ; SI-NEXT:    s_add_i32 s21, s21, 3
+; SI-NEXT:    s_add_i32 s20, s20, 3
 ; SI-NEXT:    s_add_i32 s23, s23, 3
+; SI-NEXT:    s_add_i32 s22, s22, 3
 ; SI-NEXT:    s_add_i32 s25, s25, 3
-; SI-NEXT:    v_mov_b32_e32 v3, s24
-; SI-NEXT:    v_mov_b32_e32 v6, s22
-; SI-NEXT:    v_mov_b32_e32 v9, s20
-; SI-NEXT:    v_mov_b32_e32 v12, s18
-; SI-NEXT:    v_mov_b32_e32 v15, s16
-; SI-NEXT:    v_alignbit_b32 v1, s25, v3, 24
-; SI-NEXT:    v_alignbit_b32 v2, s25, v3, 16
-; SI-NEXT:    v_alignbit_b32 v3, s25, v3, 8
-; SI-NEXT:    v_alignbit_b32 v4, s23, v6, 24
-; SI-NEXT:    v_alignbit_b32 v5, s23, v6, 16
-; SI-NEXT:    v_alignbit_b32 v6, s23, v6, 8
-; SI-NEXT:    v_alignbit_b32 v7, s21, v9, 24
-; SI-NEXT:    v_alignbit_b32 v8, s21, v9, 16
-; SI-NEXT:    v_alignbit_b32 v9, s21, v9, 8
-; SI-NEXT:    v_alignbit_b32 v10, s19, v12, 24
-; SI-NEXT:    v_alignbit_b32 v11, s19, v12, 16
-; SI-NEXT:    v_alignbit_b32 v12, s19, v12, 8
-; SI-NEXT:    v_alignbit_b32 v13, s17, v15, 24
-; SI-NEXT:    v_alignbit_b32 v14, s17, v15, 16
-; SI-NEXT:    v_alignbit_b32 v15, s17, v15, 8
-; SI-NEXT:    s_lshr_b32 s6, s25, 24
-; SI-NEXT:    s_lshr_b32 s7, s25, 16
-; SI-NEXT:    s_lshr_b32 s8, s25, 8
-; SI-NEXT:    s_lshr_b32 s9, s23, 24
-; SI-NEXT:    s_lshr_b32 s10, s23, 16
-; SI-NEXT:    s_lshr_b32 s11, s23, 8
-; SI-NEXT:    s_lshr_b32 s12, s21, 24
-; SI-NEXT:    s_lshr_b32 s13, s21, 16
-; SI-NEXT:    s_lshr_b32 s14, s21, 8
-; SI-NEXT:    s_lshr_b32 s15, s19, 24
-; SI-NEXT:    s_lshr_b32 s26, s19, 16
-; SI-NEXT:    s_lshr_b32 s27, s19, 8
-; SI-NEXT:    s_lshr_b32 s28, s17, 24
-; SI-NEXT:    s_lshr_b32 s29, s17, 16
-; SI-NEXT:    s_lshr_b32 s40, s17, 8
+; SI-NEXT:    s_add_i32 s24, s24, 3
+; SI-NEXT:    s_lshr_b64 s[4:5], s[24:25], 24
+; SI-NEXT:    s_lshr_b64 s[6:7], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[24:25], 8
+; SI-NEXT:    s_lshr_b64 s[10:11], s[22:23], 24
+; SI-NEXT:    s_lshr_b64 s[12:13], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[22:23], 8
+; SI-NEXT:    s_lshr_b64 s[26:27], s[20:21], 24
+; SI-NEXT:    s_lshr_b64 s[28:29], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[20:21], 8
+; SI-NEXT:    s_lshr_b64 s[42:43], s[18:19], 24
+; SI-NEXT:    s_lshr_b32 s72, s25, 24
+; SI-NEXT:    s_lshr_b32 s73, s25, 16
+; SI-NEXT:    s_lshr_b32 s74, s25, 8
+; SI-NEXT:    s_lshr_b32 s75, s23, 24
+; SI-NEXT:    s_lshr_b32 s76, s23, 16
+; SI-NEXT:    s_lshr_b32 s77, s23, 8
+; SI-NEXT:    s_lshr_b32 s78, s21, 24
+; SI-NEXT:    s_lshr_b32 s79, s21, 16
+; SI-NEXT:    s_lshr_b32 s88, s21, 8
+; SI-NEXT:    s_lshr_b32 s89, s19, 24
+; SI-NEXT:    s_lshr_b32 s90, s19, 16
+; SI-NEXT:    s_lshr_b32 s91, s19, 8
+; SI-NEXT:    s_lshr_b32 s92, s17, 24
+; SI-NEXT:    s_lshr_b32 s93, s17, 16
+; SI-NEXT:    s_lshr_b32 s94, s17, 8
+; SI-NEXT:    s_lshr_b64 s[44:45], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[18:19], 8
+; SI-NEXT:    s_lshr_b64 s[56:57], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[58:59], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[16:17], 8
 ; SI-NEXT:  .LBB13_3: ; %end
-; SI-NEXT:    s_and_b32 s4, s16, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 8, v15
-; SI-NEXT:    v_or_b32_e32 v15, s4, v15
-; SI-NEXT:    s_and_b32 s4, s17, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s40, 8
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s29, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s28, 24
-; SI-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 24, v13
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_or_b32_e32 v13, v13, v14
-; SI-NEXT:    v_mov_b32_e32 v14, s4
-; SI-NEXT:    s_and_b32 s4, s18, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 8, v12
-; SI-NEXT:    v_or_b32_e32 v12, s4, v12
-; SI-NEXT:    s_and_b32 s4, s19, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s27, 8
-; SI-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s26, 0xff
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s15, s15, 24
-; SI-NEXT:    v_or_b32_e32 v13, v15, v13
-; SI-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; SI-NEXT:    v_or_b32_e32 v10, v10, v11
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s15, s5
-; SI-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen
+; SI-NEXT:    s_lshl_b32 s5, s60, 8
+; SI-NEXT:    s_and_b32 s7, s16, 0xff
+; SI-NEXT:    s_or_b32 s5, s7, s5
+; SI-NEXT:    s_and_b32 s7, s58, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s56, 24
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    s_or_b32 s7, s9, s7
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_or_b32 s5, s5, s7
+; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    s_and_b32 s5, s17, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s94, 8
+; SI-NEXT:    s_or_b32 s5, s5, s7
+; SI-NEXT:    s_and_b32 s7, s93, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    s_lshl_b32 s9, s92, 24
+; SI-NEXT:    s_or_b32 s7, s9, s7
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_or_b32 s5, s5, s7
+; SI-NEXT:    v_mov_b32_e32 v2, s5
+; SI-NEXT:    s_lshl_b32 s5, s46, 8
+; SI-NEXT:    s_and_b32 s7, s18, 0xff
+; SI-NEXT:    s_or_b32 s5, s7, s5
+; SI-NEXT:    s_and_b32 s7, s44, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    s_lshl_b32 s9, s42, 24
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_or_b32 s7, s9, s7
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v13, vcc, 4, v0
-; SI-NEXT:    v_or_b32_e32 v10, v12, v10
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 8, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v14, v13, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v10, v11, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v11, s4
-; SI-NEXT:    s_and_b32 s4, s20, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; SI-NEXT:    v_or_b32_e32 v9, s4, v9
-; SI-NEXT:    s_and_b32 s4, s21, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s14, 8
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s13, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s12, s12, 24
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; SI-NEXT:    v_or_b32_e32 v7, v7, v8
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s12, s5
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
+; SI-NEXT:    s_or_b32 s5, s5, s7
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 12, v0
-; SI-NEXT:    v_or_b32_e32 v7, v9, v7
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 16, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v11, v10, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v8, s4
-; SI-NEXT:    s_and_b32 s4, s22, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
-; SI-NEXT:    v_or_b32_e32 v6, s4, v6
-; SI-NEXT:    s_and_b32 s4, s23, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s11, 8
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s10, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s9, s9, 24
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; SI-NEXT:    v_or_b32_e32 v4, v4, v5
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s9, s5
+; SI-NEXT:    v_mov_b32_e32 v2, s5
+; SI-NEXT:    s_and_b32 s5, s19, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s91, 8
+; SI-NEXT:    s_or_b32 s5, s5, s7
+; SI-NEXT:    s_and_b32 s7, s90, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    s_lshl_b32 s9, s89, 24
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_or_b32 s7, s9, s7
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
+; SI-NEXT:    s_or_b32 s5, s5, s7
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 20, v0
-; SI-NEXT:    v_or_b32_e32 v4, v6, v4
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 24, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v8, v7, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v5, s4
-; SI-NEXT:    s_and_b32 s4, s24, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; SI-NEXT:    v_or_b32_e32 v3, s4, v3
+; SI-NEXT:    v_mov_b32_e32 v2, s5
+; SI-NEXT:    s_and_b32 s5, s20, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s40, 8
+; SI-NEXT:    s_or_b32 s5, s5, s7
+; SI-NEXT:    s_and_b32 s7, s28, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    s_lshl_b32 s9, s26, 24
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_or_b32 s7, s9, s7
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 12, v0
+; SI-NEXT:    s_or_b32 s5, s5, s7
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s5
+; SI-NEXT:    s_and_b32 s5, s21, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s88, 8
+; SI-NEXT:    s_or_b32 s5, s5, s7
+; SI-NEXT:    s_and_b32 s7, s79, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    s_lshl_b32 s9, s78, 24
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_or_b32 s7, s9, s7
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 16, v0
+; SI-NEXT:    s_or_b32 s5, s5, s7
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s5
+; SI-NEXT:    s_and_b32 s5, s22, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s14, 8
+; SI-NEXT:    s_or_b32 s5, s5, s7
+; SI-NEXT:    s_and_b32 s7, s12, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    s_lshl_b32 s9, s10, 24
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_or_b32 s7, s9, s7
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 20, v0
+; SI-NEXT:    s_or_b32 s5, s5, s7
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s5
+; SI-NEXT:    s_and_b32 s5, s23, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s77, 8
+; SI-NEXT:    s_or_b32 s5, s5, s7
+; SI-NEXT:    s_and_b32 s7, s76, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    s_lshl_b32 s9, s75, 24
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_or_b32 s7, s9, s7
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 24, v0
+; SI-NEXT:    s_or_b32 s5, s5, s7
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s5
+; SI-NEXT:    s_and_b32 s5, s24, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s8, 8
+; SI-NEXT:    s_and_b32 s6, s6, 0xff
+; SI-NEXT:    s_or_b32 s5, s5, s7
+; SI-NEXT:    s_lshl_b32 s6, s6, 16
+; SI-NEXT:    s_lshl_b32 s4, s4, 24
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_or_b32 s4, s4, s6
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 28, v0
+; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s4
 ; SI-NEXT:    s_and_b32 s4, s25, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s8, 8
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT:    s_lshl_b32 s5, s74, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s7, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; SI-NEXT:    s_and_b32 s5, s73, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s6, s6, 24
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    s_lshl_b32 s6, s72, 24
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s5, s6, s5
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 28, v0
-; SI-NEXT:    v_or_b32_e32 v1, v3, v1
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 32, v0
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 32, v0
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 36, v0
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB13_4:
-; SI-NEXT:    ; implicit-def: $vgpr15
-; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; implicit-def: $vgpr13
+; SI-NEXT:    ; implicit-def: $sgpr60
+; SI-NEXT:    ; implicit-def: $sgpr58
+; SI-NEXT:    ; implicit-def: $sgpr56
+; SI-NEXT:    ; implicit-def: $sgpr94
+; SI-NEXT:    ; implicit-def: $sgpr93
+; SI-NEXT:    ; implicit-def: $sgpr92
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr44
+; SI-NEXT:    ; implicit-def: $sgpr42
+; SI-NEXT:    ; implicit-def: $sgpr91
+; SI-NEXT:    ; implicit-def: $sgpr90
+; SI-NEXT:    ; implicit-def: $sgpr89
 ; SI-NEXT:    ; implicit-def: $sgpr40
-; SI-NEXT:    ; implicit-def: $sgpr29
 ; SI-NEXT:    ; implicit-def: $sgpr28
-; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $sgpr27
 ; SI-NEXT:    ; implicit-def: $sgpr26
-; SI-NEXT:    ; implicit-def: $sgpr15
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr7
+; SI-NEXT:    ; implicit-def: $sgpr88
+; SI-NEXT:    ; implicit-def: $sgpr79
+; SI-NEXT:    ; implicit-def: $sgpr78
 ; SI-NEXT:    ; implicit-def: $sgpr14
-; SI-NEXT:    ; implicit-def: $sgpr13
 ; SI-NEXT:    ; implicit-def: $sgpr12
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $sgpr11
 ; SI-NEXT:    ; implicit-def: $sgpr10
-; SI-NEXT:    ; implicit-def: $sgpr9
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $sgpr77
+; SI-NEXT:    ; implicit-def: $sgpr76
+; SI-NEXT:    ; implicit-def: $sgpr75
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $sgpr7
 ; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr74
+; SI-NEXT:    ; implicit-def: $sgpr73
+; SI-NEXT:    ; implicit-def: $sgpr72
+; SI-NEXT:    ; implicit-def: $sgpr4
 ; SI-NEXT:    s_branch .LBB13_2
 ;
 ; VI-LABEL: bitcast_v10i32_to_v40i8_scalar:
@@ -7591,72 +7584,83 @@ define inreg <20 x i16> @bitcast_v10f32_to_v20i16_scalar(<10 x float> inreg %a,
 ; SI-NEXT:    s_cmp_lg_u32 s26, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB25_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s24
-; SI-NEXT:    v_alignbit_b32 v17, s25, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    v_alignbit_b32 v13, s23, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v9, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s10, s25, 16
-; SI-NEXT:    s_lshr_b32 s9, s23, 16
-; SI-NEXT:    s_lshr_b32 s8, s21, 16
-; SI-NEXT:    s_lshr_b32 s7, s19, 16
-; SI-NEXT:    s_lshr_b32 s6, s17, 16
+; SI-NEXT:    s_lshr_b32 s40, s25, 16
+; SI-NEXT:    s_lshr_b32 s29, s23, 16
+; SI-NEXT:    s_lshr_b32 s28, s21, 16
+; SI-NEXT:    s_lshr_b32 s27, s19, 16
+; SI-NEXT:    s_lshr_b32 s26, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB25_4
 ; SI-NEXT:  .LBB25_2: ; %cmp.true
-; SI-NEXT:    v_add_f32_e64 v2, s17, 1.0
-; SI-NEXT:    v_add_f32_e64 v0, s16, 1.0
-; SI-NEXT:    v_add_f32_e64 v6, s19, 1.0
-; SI-NEXT:    v_add_f32_e64 v4, s18, 1.0
-; SI-NEXT:    v_add_f32_e64 v10, s21, 1.0
-; SI-NEXT:    v_add_f32_e64 v8, s20, 1.0
-; SI-NEXT:    v_add_f32_e64 v14, s23, 1.0
-; SI-NEXT:    v_add_f32_e64 v12, s22, 1.0
-; SI-NEXT:    v_add_f32_e64 v18, s25, 1.0
-; SI-NEXT:    v_add_f32_e64 v16, s24, 1.0
-; SI-NEXT:    v_alignbit_b32 v17, v18, v16, 16
-; SI-NEXT:    v_alignbit_b32 v13, v14, v12, 16
-; SI-NEXT:    v_alignbit_b32 v9, v10, v8, 16
-; SI-NEXT:    v_alignbit_b32 v5, v6, v4, 16
-; SI-NEXT:    v_alignbit_b32 v1, v2, v0, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-NEXT:    s_setpc_b64 s[30:31]
+; SI-NEXT:    v_add_f32_e64 v29, s17, 1.0
+; SI-NEXT:    v_add_f32_e64 v28, s16, 1.0
+; SI-NEXT:    v_add_f32_e64 v27, s19, 1.0
+; SI-NEXT:    v_add_f32_e64 v26, s18, 1.0
+; SI-NEXT:    v_add_f32_e64 v25, s21, 1.0
+; SI-NEXT:    v_add_f32_e64 v24, s20, 1.0
+; SI-NEXT:    v_add_f32_e64 v23, s23, 1.0
+; SI-NEXT:    v_add_f32_e64 v22, s22, 1.0
+; SI-NEXT:    v_add_f32_e64 v21, s25, 1.0
+; SI-NEXT:    v_add_f32_e64 v20, s24, 1.0
+; SI-NEXT:    v_lshr_b64 v[17:18], v[20:21], 16
+; SI-NEXT:    v_lshr_b64 v[13:14], v[22:23], 16
+; SI-NEXT:    v_lshr_b64 v[9:10], v[24:25], 16
+; SI-NEXT:    v_lshr_b64 v[5:6], v[26:27], 16
+; SI-NEXT:    v_lshr_b64 v[1:2], v[28:29], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v21
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v23
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v25
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v27
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v29
+; SI-NEXT:    s_branch .LBB25_5
 ; SI-NEXT:  .LBB25_3:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $sgpr9
-; SI-NEXT:    ; implicit-def: $vgpr17
+; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr26
 ; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr27
+; SI-NEXT:    ; implicit-def: $sgpr8
+; SI-NEXT:    ; implicit-def: $sgpr28
+; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr29
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr40
 ; SI-NEXT:    s_branch .LBB25_2
 ; SI-NEXT:  .LBB25_4:
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_mov_b32_e32 v2, s17
-; SI-NEXT:    v_mov_b32_e32 v4, s18
-; SI-NEXT:    v_mov_b32_e32 v6, s19
-; SI-NEXT:    v_mov_b32_e32 v8, s20
-; SI-NEXT:    v_mov_b32_e32 v10, s21
-; SI-NEXT:    v_mov_b32_e32 v12, s22
-; SI-NEXT:    v_mov_b32_e32 v14, s23
-; SI-NEXT:    v_mov_b32_e32 v16, s24
-; SI-NEXT:    v_mov_b32_e32 v18, s25
-; SI-NEXT:    v_mov_b32_e32 v3, s6
-; SI-NEXT:    v_mov_b32_e32 v7, s7
-; SI-NEXT:    v_mov_b32_e32 v11, s8
-; SI-NEXT:    v_mov_b32_e32 v15, s9
-; SI-NEXT:    v_mov_b32_e32 v19, s10
+; SI-NEXT:    v_mov_b32_e32 v28, s16
+; SI-NEXT:    v_mov_b32_e32 v29, s17
+; SI-NEXT:    v_mov_b32_e32 v26, s18
+; SI-NEXT:    v_mov_b32_e32 v27, s19
+; SI-NEXT:    v_mov_b32_e32 v24, s20
+; SI-NEXT:    v_mov_b32_e32 v25, s21
+; SI-NEXT:    v_mov_b32_e32 v22, s22
+; SI-NEXT:    v_mov_b32_e32 v23, s23
+; SI-NEXT:    v_mov_b32_e32 v20, s24
+; SI-NEXT:    v_mov_b32_e32 v21, s25
+; SI-NEXT:    v_mov_b32_e32 v3, s26
+; SI-NEXT:    v_mov_b32_e32 v7, s27
+; SI-NEXT:    v_mov_b32_e32 v11, s28
+; SI-NEXT:    v_mov_b32_e32 v15, s29
+; SI-NEXT:    v_mov_b32_e32 v19, s40
+; SI-NEXT:    v_mov_b32_e32 v1, s12
+; SI-NEXT:    v_mov_b32_e32 v5, s10
+; SI-NEXT:    v_mov_b32_e32 v9, s8
+; SI-NEXT:    v_mov_b32_e32 v13, s6
+; SI-NEXT:    v_mov_b32_e32 v17, s4
+; SI-NEXT:  .LBB25_5: ; %end
+; SI-NEXT:    v_mov_b32_e32 v0, v28
+; SI-NEXT:    v_mov_b32_e32 v2, v29
+; SI-NEXT:    v_mov_b32_e32 v4, v26
+; SI-NEXT:    v_mov_b32_e32 v6, v27
+; SI-NEXT:    v_mov_b32_e32 v8, v24
+; SI-NEXT:    v_mov_b32_e32 v10, v25
+; SI-NEXT:    v_mov_b32_e32 v12, v22
+; SI-NEXT:    v_mov_b32_e32 v14, v23
+; SI-NEXT:    v_mov_b32_e32 v16, v20
+; SI-NEXT:    v_mov_b32_e32 v18, v21
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bitcast_v10f32_to_v20i16_scalar:
@@ -10305,256 +10309,261 @@ define inreg <40 x i8> @bitcast_v10f32_to_v40i8_scalar(<10 x float> inreg %a, i3
 ; SI-NEXT:    s_cmp_lg_u32 s26, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB33_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v3, s24
-; SI-NEXT:    v_mov_b32_e32 v6, s22
-; SI-NEXT:    v_mov_b32_e32 v9, s20
-; SI-NEXT:    v_mov_b32_e32 v12, s18
-; SI-NEXT:    v_mov_b32_e32 v15, s16
-; SI-NEXT:    v_alignbit_b32 v1, s25, v3, 24
-; SI-NEXT:    v_alignbit_b32 v2, s25, v3, 16
-; SI-NEXT:    v_alignbit_b32 v3, s25, v3, 8
-; SI-NEXT:    v_alignbit_b32 v4, s23, v6, 24
-; SI-NEXT:    v_alignbit_b32 v5, s23, v6, 16
-; SI-NEXT:    v_alignbit_b32 v6, s23, v6, 8
-; SI-NEXT:    v_alignbit_b32 v7, s21, v9, 24
-; SI-NEXT:    v_alignbit_b32 v8, s21, v9, 16
-; SI-NEXT:    v_alignbit_b32 v9, s21, v9, 8
-; SI-NEXT:    v_alignbit_b32 v10, s19, v12, 24
-; SI-NEXT:    v_alignbit_b32 v11, s19, v12, 16
-; SI-NEXT:    v_alignbit_b32 v12, s19, v12, 8
-; SI-NEXT:    v_alignbit_b32 v13, s17, v15, 24
-; SI-NEXT:    v_alignbit_b32 v14, s17, v15, 16
-; SI-NEXT:    v_alignbit_b32 v15, s17, v15, 8
-; SI-NEXT:    s_lshr_b32 s28, s25, 24
-; SI-NEXT:    s_lshr_b32 s29, s25, 16
-; SI-NEXT:    s_lshr_b32 s40, s25, 8
-; SI-NEXT:    s_lshr_b32 s15, s23, 24
-; SI-NEXT:    s_lshr_b32 s26, s23, 16
-; SI-NEXT:    s_lshr_b32 s27, s23, 8
-; SI-NEXT:    s_lshr_b32 s12, s21, 24
-; SI-NEXT:    s_lshr_b32 s13, s21, 16
-; SI-NEXT:    s_lshr_b32 s14, s21, 8
-; SI-NEXT:    s_lshr_b32 s9, s19, 24
-; SI-NEXT:    s_lshr_b32 s10, s19, 16
-; SI-NEXT:    s_lshr_b32 s11, s19, 8
-; SI-NEXT:    s_lshr_b32 s6, s17, 24
-; SI-NEXT:    s_lshr_b32 s7, s17, 16
-; SI-NEXT:    s_lshr_b32 s8, s17, 8
+; SI-NEXT:    s_lshr_b32 s91, s25, 24
+; SI-NEXT:    s_lshr_b32 s93, s25, 16
+; SI-NEXT:    s_lshr_b32 s94, s25, 8
+; SI-NEXT:    s_lshr_b32 s88, s23, 24
+; SI-NEXT:    s_lshr_b32 s90, s23, 16
+; SI-NEXT:    s_lshr_b32 s92, s23, 8
+; SI-NEXT:    s_lshr_b32 s77, s21, 24
+; SI-NEXT:    s_lshr_b32 s79, s21, 16
+; SI-NEXT:    s_lshr_b32 s89, s21, 8
+; SI-NEXT:    s_lshr_b32 s74, s19, 24
+; SI-NEXT:    s_lshr_b32 s76, s19, 16
+; SI-NEXT:    s_lshr_b32 s78, s19, 8
+; SI-NEXT:    s_lshr_b32 s72, s17, 24
+; SI-NEXT:    s_lshr_b32 s73, s17, 16
+; SI-NEXT:    s_lshr_b32 s75, s17, 8
+; SI-NEXT:    s_lshr_b64 s[4:5], s[24:25], 24
+; SI-NEXT:    s_lshr_b64 s[6:7], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[24:25], 8
+; SI-NEXT:    s_lshr_b64 s[10:11], s[22:23], 24
+; SI-NEXT:    s_lshr_b64 s[12:13], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[22:23], 8
+; SI-NEXT:    s_lshr_b64 s[26:27], s[20:21], 24
+; SI-NEXT:    s_lshr_b64 s[28:29], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[20:21], 8
+; SI-NEXT:    s_lshr_b64 s[42:43], s[18:19], 24
+; SI-NEXT:    s_lshr_b64 s[44:45], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[18:19], 8
+; SI-NEXT:    s_lshr_b64 s[56:57], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[58:59], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[16:17], 8
 ; SI-NEXT:    s_cbranch_execnz .LBB33_4
 ; SI-NEXT:  .LBB33_2: ; %cmp.true
-; SI-NEXT:    v_add_f32_e64 v31, s17, 1.0
-; SI-NEXT:    v_add_f32_e64 v34, s16, 1.0
-; SI-NEXT:    v_add_f32_e64 v28, s19, 1.0
-; SI-NEXT:    v_add_f32_e64 v29, s18, 1.0
-; SI-NEXT:    v_add_f32_e64 v23, s21, 1.0
-; SI-NEXT:    v_add_f32_e64 v24, s20, 1.0
-; SI-NEXT:    v_add_f32_e64 v18, s23, 1.0
-; SI-NEXT:    v_add_f32_e64 v21, s22, 1.0
-; SI-NEXT:    v_add_f32_e64 v16, s25, 1.0
-; SI-NEXT:    v_add_f32_e64 v17, s24, 1.0
-; SI-NEXT:    v_alignbit_b32 v1, v16, v17, 24
-; SI-NEXT:    v_alignbit_b32 v2, v16, v17, 16
-; SI-NEXT:    v_alignbit_b32 v3, v16, v17, 8
-; SI-NEXT:    v_alignbit_b32 v4, v18, v21, 24
-; SI-NEXT:    v_alignbit_b32 v5, v18, v21, 16
-; SI-NEXT:    v_alignbit_b32 v6, v18, v21, 8
-; SI-NEXT:    v_alignbit_b32 v7, v23, v24, 24
-; SI-NEXT:    v_alignbit_b32 v8, v23, v24, 16
-; SI-NEXT:    v_alignbit_b32 v9, v23, v24, 8
-; SI-NEXT:    v_alignbit_b32 v10, v28, v29, 24
-; SI-NEXT:    v_alignbit_b32 v11, v28, v29, 16
-; SI-NEXT:    v_alignbit_b32 v12, v28, v29, 8
-; SI-NEXT:    v_alignbit_b32 v13, v31, v34, 24
-; SI-NEXT:    v_alignbit_b32 v14, v31, v34, 16
-; SI-NEXT:    v_alignbit_b32 v15, v31, v34, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 24, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v20, 16, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v22, 8, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v25, 24, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v26, 16, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 8, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v30, 24, v23
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v23
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 8, v23
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 24, v28
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v28
-; SI-NEXT:    v_lshrrev_b32_e32 v37, 8, v28
-; SI-NEXT:    v_lshrrev_b32_e32 v38, 24, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v48, 8, v31
+; SI-NEXT:    v_add_f32_e64 v5, s21, 1.0
+; SI-NEXT:    v_add_f32_e64 v6, s20, 1.0
+; SI-NEXT:    v_add_f32_e64 v12, s17, 1.0
+; SI-NEXT:    v_add_f32_e64 v13, s16, 1.0
+; SI-NEXT:    v_add_f32_e64 v7, s19, 1.0
+; SI-NEXT:    v_add_f32_e64 v10, s18, 1.0
+; SI-NEXT:    v_readfirstlane_b32 s16, v6
+; SI-NEXT:    v_readfirstlane_b32 s17, v5
+; SI-NEXT:    v_add_f32_e64 v3, s23, 1.0
+; SI-NEXT:    v_add_f32_e64 v4, s22, 1.0
+; SI-NEXT:    v_add_f32_e64 v1, s25, 1.0
+; SI-NEXT:    v_add_f32_e64 v2, s24, 1.0
+; SI-NEXT:    s_lshr_b64 s[26:27], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[28:29], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[16:17], 8
+; SI-NEXT:    v_readfirstlane_b32 s16, v10
+; SI-NEXT:    v_readfirstlane_b32 s17, v7
+; SI-NEXT:    v_readfirstlane_b32 s8, v2
+; SI-NEXT:    v_readfirstlane_b32 s9, v1
+; SI-NEXT:    v_readfirstlane_b32 s14, v4
+; SI-NEXT:    v_readfirstlane_b32 s15, v3
+; SI-NEXT:    s_lshr_b64 s[42:43], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[44:45], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[16:17], 8
+; SI-NEXT:    v_readfirstlane_b32 s16, v13
+; SI-NEXT:    v_readfirstlane_b32 s17, v12
+; SI-NEXT:    s_lshr_b64 s[4:5], s[8:9], 24
+; SI-NEXT:    s_lshr_b64 s[6:7], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[8:9], 8
+; SI-NEXT:    s_lshr_b64 s[10:11], s[14:15], 24
+; SI-NEXT:    s_lshr_b64 s[12:13], s[14:15], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[14:15], 8
+; SI-NEXT:    s_lshr_b64 s[56:57], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[58:59], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[16:17], 8
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 8, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 24, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v16, 8, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v17, 24, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v19, 8, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v20, 24, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v22, 8, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v23, 24, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v24, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 8, v12
 ; SI-NEXT:    s_branch .LBB33_5
 ; SI-NEXT:  .LBB33_3:
-; SI-NEXT:    ; implicit-def: $vgpr15
-; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $sgpr11
-; SI-NEXT:    ; implicit-def: $sgpr10
-; SI-NEXT:    ; implicit-def: $sgpr9
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr7
-; SI-NEXT:    ; implicit-def: $sgpr14
-; SI-NEXT:    ; implicit-def: $sgpr13
-; SI-NEXT:    ; implicit-def: $sgpr12
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $sgpr27
-; SI-NEXT:    ; implicit-def: $sgpr26
-; SI-NEXT:    ; implicit-def: $sgpr15
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $sgpr60
+; SI-NEXT:    ; implicit-def: $sgpr58
+; SI-NEXT:    ; implicit-def: $sgpr56
+; SI-NEXT:    ; implicit-def: $sgpr75
+; SI-NEXT:    ; implicit-def: $sgpr73
+; SI-NEXT:    ; implicit-def: $sgpr72
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr44
+; SI-NEXT:    ; implicit-def: $sgpr42
+; SI-NEXT:    ; implicit-def: $sgpr78
+; SI-NEXT:    ; implicit-def: $sgpr76
+; SI-NEXT:    ; implicit-def: $sgpr74
 ; SI-NEXT:    ; implicit-def: $sgpr40
-; SI-NEXT:    ; implicit-def: $sgpr29
 ; SI-NEXT:    ; implicit-def: $sgpr28
+; SI-NEXT:    ; implicit-def: $sgpr26
+; SI-NEXT:    ; implicit-def: $sgpr89
+; SI-NEXT:    ; implicit-def: $sgpr79
+; SI-NEXT:    ; implicit-def: $sgpr77
+; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr92
+; SI-NEXT:    ; implicit-def: $sgpr90
+; SI-NEXT:    ; implicit-def: $sgpr88
+; SI-NEXT:    ; implicit-def: $sgpr8
+; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr94
+; SI-NEXT:    ; implicit-def: $sgpr93
+; SI-NEXT:    ; implicit-def: $sgpr91
+; SI-NEXT:    ; implicit-def: $sgpr4
 ; SI-NEXT:    s_branch .LBB33_2
 ; SI-NEXT:  .LBB33_4:
-; SI-NEXT:    v_mov_b32_e32 v34, s16
-; SI-NEXT:    v_mov_b32_e32 v31, s17
-; SI-NEXT:    v_mov_b32_e32 v29, s18
-; SI-NEXT:    v_mov_b32_e32 v28, s19
-; SI-NEXT:    v_mov_b32_e32 v24, s20
-; SI-NEXT:    v_mov_b32_e32 v23, s21
-; SI-NEXT:    v_mov_b32_e32 v21, s22
-; SI-NEXT:    v_mov_b32_e32 v18, s23
-; SI-NEXT:    v_mov_b32_e32 v17, s24
-; SI-NEXT:    v_mov_b32_e32 v16, s25
-; SI-NEXT:    v_mov_b32_e32 v48, s8
-; SI-NEXT:    v_mov_b32_e32 v39, s7
-; SI-NEXT:    v_mov_b32_e32 v38, s6
-; SI-NEXT:    v_mov_b32_e32 v37, s11
-; SI-NEXT:    v_mov_b32_e32 v36, s10
-; SI-NEXT:    v_mov_b32_e32 v35, s9
-; SI-NEXT:    v_mov_b32_e32 v33, s14
-; SI-NEXT:    v_mov_b32_e32 v32, s13
-; SI-NEXT:    v_mov_b32_e32 v30, s12
-; SI-NEXT:    v_mov_b32_e32 v27, s27
-; SI-NEXT:    v_mov_b32_e32 v26, s26
-; SI-NEXT:    v_mov_b32_e32 v25, s15
-; SI-NEXT:    v_mov_b32_e32 v22, s40
-; SI-NEXT:    v_mov_b32_e32 v20, s29
-; SI-NEXT:    v_mov_b32_e32 v19, s28
+; SI-NEXT:    v_mov_b32_e32 v13, s16
+; SI-NEXT:    v_mov_b32_e32 v12, s17
+; SI-NEXT:    v_mov_b32_e32 v10, s18
+; SI-NEXT:    v_mov_b32_e32 v7, s19
+; SI-NEXT:    v_mov_b32_e32 v6, s20
+; SI-NEXT:    v_mov_b32_e32 v5, s21
+; SI-NEXT:    v_mov_b32_e32 v4, s22
+; SI-NEXT:    v_mov_b32_e32 v3, s23
+; SI-NEXT:    v_mov_b32_e32 v2, s24
+; SI-NEXT:    v_mov_b32_e32 v1, s25
+; SI-NEXT:    v_mov_b32_e32 v25, s75
+; SI-NEXT:    v_mov_b32_e32 v24, s73
+; SI-NEXT:    v_mov_b32_e32 v23, s72
+; SI-NEXT:    v_mov_b32_e32 v22, s78
+; SI-NEXT:    v_mov_b32_e32 v21, s76
+; SI-NEXT:    v_mov_b32_e32 v20, s74
+; SI-NEXT:    v_mov_b32_e32 v19, s89
+; SI-NEXT:    v_mov_b32_e32 v18, s79
+; SI-NEXT:    v_mov_b32_e32 v17, s77
+; SI-NEXT:    v_mov_b32_e32 v16, s92
+; SI-NEXT:    v_mov_b32_e32 v15, s90
+; SI-NEXT:    v_mov_b32_e32 v14, s88
+; SI-NEXT:    v_mov_b32_e32 v11, s94
+; SI-NEXT:    v_mov_b32_e32 v9, s93
+; SI-NEXT:    v_mov_b32_e32 v8, s91
 ; SI-NEXT:  .LBB33_5: ; %end
-; SI-NEXT:    v_and_b32_e32 v34, 0xff, v34
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 8, v15
-; SI-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; SI-NEXT:    v_or_b32_e32 v15, v34, v15
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 24, v13
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; SI-NEXT:    v_or_b32_e32 v13, v13, v14
-; SI-NEXT:    v_or_b32_e32 v13, v15, v13
-; SI-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v13, 0xff, v31
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 8, v48
-; SI-NEXT:    v_or_b32_e32 v13, v13, v14
-; SI-NEXT:    v_and_b32_e32 v14, 0xff, v39
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 24, v38
+; SI-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; SI-NEXT:    s_lshl_b32 s5, s60, 8
+; SI-NEXT:    v_or_b32_e32 v13, s5, v13
+; SI-NEXT:    s_and_b32 s5, s58, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    s_lshl_b32 s7, s56, 24
 ; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; SI-NEXT:    v_or_b32_e32 v14, v15, v14
-; SI-NEXT:    v_or_b32_e32 v13, v13, v14
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 4, v0
-; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
+; SI-NEXT:    s_or_b32 s5, s7, s5
+; SI-NEXT:    v_or_b32_e32 v13, s5, v13
+; SI-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v12, 0xff, v12
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v13, 0xff, v29
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 8, v12
-; SI-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; SI-NEXT:    v_or_b32_e32 v12, v13, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 8, v25
+; SI-NEXT:    v_or_b32_e32 v12, v12, v13
+; SI-NEXT:    v_and_b32_e32 v13, 0xff, v24
+; SI-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; SI-NEXT:    s_lshl_b32 s5, s46, 8
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; SI-NEXT:    v_lshlrev_b32_e32 v23, 24, v23
+; SI-NEXT:    v_or_b32_e32 v10, s5, v10
+; SI-NEXT:    s_and_b32 s5, s44, 0xff
 ; SI-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; SI-NEXT:    v_or_b32_e32 v10, v10, v11
-; SI-NEXT:    v_or_b32_e32 v10, v12, v10
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 8, v0
-; SI-NEXT:    buffer_store_dword v10, v11, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v10, 0xff, v28
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 8, v37
-; SI-NEXT:    v_or_b32_e32 v10, v10, v11
-; SI-NEXT:    v_and_b32_e32 v11, 0xff, v36
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 24, v35
+; SI-NEXT:    v_or_b32_e32 v13, v23, v13
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    s_lshl_b32 s7, s42, 24
+; SI-NEXT:    v_or_b32_e32 v12, v12, v13
+; SI-NEXT:    v_add_i32_e32 v13, vcc, 4, v0
 ; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; SI-NEXT:    v_or_b32_e32 v11, v12, v11
-; SI-NEXT:    v_or_b32_e32 v10, v10, v11
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 12, v0
-; SI-NEXT:    buffer_store_dword v10, v11, s[0:3], 0 offen
+; SI-NEXT:    s_or_b32 s5, s7, s5
+; SI-NEXT:    buffer_store_dword v12, v13, s[0:3], 0 offen
+; SI-NEXT:    v_or_b32_e32 v10, s5, v10
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v10, 0xff, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; SI-NEXT:    v_or_b32_e32 v9, v10, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; SI-NEXT:    v_or_b32_e32 v7, v7, v8
-; SI-NEXT:    v_or_b32_e32 v7, v9, v7
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 16, v0
-; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v12, vcc, 8, v0
+; SI-NEXT:    buffer_store_dword v10, v12, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v23
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 8, v33
-; SI-NEXT:    v_or_b32_e32 v7, v7, v8
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v32
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 24, v30
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 8, v22
+; SI-NEXT:    v_or_b32_e32 v7, v7, v10
+; SI-NEXT:    v_and_b32_e32 v10, 0xff, v21
+; SI-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; SI-NEXT:    s_lshl_b32 s5, s40, 8
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_lshlrev_b32_e32 v12, 24, v20
+; SI-NEXT:    v_or_b32_e32 v6, s5, v6
+; SI-NEXT:    s_and_b32 s5, s28, 0xff
 ; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; SI-NEXT:    v_or_b32_e32 v8, v9, v8
-; SI-NEXT:    v_or_b32_e32 v7, v7, v8
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 20, v0
-; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
+; SI-NEXT:    v_or_b32_e32 v10, v12, v10
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    s_lshl_b32 s7, s26, 24
+; SI-NEXT:    v_or_b32_e32 v7, v7, v10
+; SI-NEXT:    v_add_i32_e32 v10, vcc, 12, v0
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT:    s_or_b32 s5, s7, s5
+; SI-NEXT:    buffer_store_dword v7, v10, s[0:3], 0 offen
+; SI-NEXT:    v_or_b32_e32 v6, s5, v6
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v21
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
+; SI-NEXT:    v_add_i32_e32 v7, vcc, 16, v0
+; SI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT:    v_or_b32_e32 v6, v7, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; SI-NEXT:    v_or_b32_e32 v4, v4, v5
-; SI-NEXT:    v_or_b32_e32 v4, v6, v4
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 24, v0
-; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xff, v18
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v27
-; SI-NEXT:    v_or_b32_e32 v4, v4, v5
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v26
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v25
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v19
+; SI-NEXT:    v_or_b32_e32 v5, v5, v6
+; SI-NEXT:    v_and_b32_e32 v6, 0xff, v18
+; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; SI-NEXT:    s_lshl_b32 s5, s14, 8
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v17
+; SI-NEXT:    v_or_b32_e32 v4, s5, v4
+; SI-NEXT:    s_and_b32 s5, s12, 0xff
+; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; SI-NEXT:    v_or_b32_e32 v6, v7, v6
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    s_lshl_b32 s7, s10, 24
+; SI-NEXT:    v_or_b32_e32 v5, v5, v6
+; SI-NEXT:    v_add_i32_e32 v6, vcc, 20, v0
 ; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; SI-NEXT:    v_or_b32_e32 v5, v6, v5
-; SI-NEXT:    v_or_b32_e32 v4, v4, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 28, v0
+; SI-NEXT:    s_or_b32 s5, s7, s5
+; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
+; SI-NEXT:    v_or_b32_e32 v4, s5, v4
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_add_i32_e32 v5, vcc, 24, v0
 ; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xff, v17
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v16
+; SI-NEXT:    v_or_b32_e32 v3, v3, v4
+; SI-NEXT:    v_and_b32_e32 v4, 0xff, v15
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_or_b32_e32 v3, v4, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; SI-NEXT:    s_lshl_b32 s5, s8, 8
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 24, v14
+; SI-NEXT:    v_or_b32_e32 v2, s5, v2
+; SI-NEXT:    s_and_b32 s5, s6, 0xff
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_or_b32_e32 v1, v3, v1
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 32, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    v_or_b32_e32 v4, v5, v4
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    s_lshl_b32 s4, s4, 24
+; SI-NEXT:    v_or_b32_e32 v3, v3, v4
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 28, v0
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    s_or_b32 s4, s4, s5
+; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT:    v_or_b32_e32 v2, s4, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v16
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v22
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 32, v0
+; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v11
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v20
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v9
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v19
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v8
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
@@ -15184,124 +15193,127 @@ define inreg <20 x i16> @bitcast_v20f16_to_v20i16_scalar(<20 x half> inreg %a, i
 ; SI-LABEL: bitcast_v20f16_to_v20i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v19, v5
-; SI-NEXT:    v_mov_b32_e32 v18, v4
-; SI-NEXT:    v_mov_b32_e32 v17, v3
-; SI-NEXT:    v_mov_b32_e32 v16, v2
-; SI-NEXT:    v_mov_b32_e32 v15, v1
+; SI-NEXT:    v_mov_b32_e32 v9, v4
+; SI-NEXT:    v_mov_b32_e32 v10, v3
+; SI-NEXT:    v_mov_b32_e32 v13, v2
 ; SI-NEXT:    v_mov_b32_e32 v14, v0
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, s16
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v23, s17
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, s18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, s19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, s20
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, s21
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, s22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, s23
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, s24
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, s25
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, s26
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, s27
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, s28
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, s29
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
-; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
+; SI-NEXT:    v_cvt_f16_f32_e32 v15, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v16, v13
+; SI-NEXT:    v_cvt_f16_f32_e32 v26, v10
+; SI-NEXT:    v_cvt_f16_f32_e32 v18, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v19, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v21, s21
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, s22
+; SI-NEXT:    v_cvt_f16_f32_e32 v24, s25
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, s26
+; SI-NEXT:    v_cvt_f16_f32_e32 v25, s29
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_cbranch_scc0 .LBB47_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_cbranch_execnz .LBB47_3
 ; SI-NEXT:  .LBB47_2: ; %cmp.true
-; SI-NEXT:    v_cvt_f32_f16_e32 v19, v19
-; SI-NEXT:    v_cvt_f32_f16_e32 v18, v18
-; SI-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; SI-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
-; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
-; SI-NEXT:    v_add_f32_e32 v15, 0x38000000, v15
+; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; SI-NEXT:    v_cvt_f32_f16_e32 v15, v15
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
-; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
-; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; SI-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v14
 ; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
+; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
+; SI-NEXT:    v_add_f32_e32 v15, 0x38000000, v15
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v23
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
+; SI-NEXT:    v_or_b32_e32 v6, v6, v9
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v24
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; SI-NEXT:    v_or_b32_e32 v10, v10, v13
+; SI-NEXT:    v_cvt_f32_f16_e32 v13, v25
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v15
+; SI-NEXT:    v_cvt_f32_f16_e32 v19, v19
+; SI-NEXT:    v_or_b32_e32 v2, v2, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v21
+; SI-NEXT:    v_or_b32_e32 v14, v14, v17
+; SI-NEXT:    v_cvt_f32_f16_e32 v17, v26
+; SI-NEXT:    v_cvt_f32_f16_e32 v18, v18
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; SI-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
 ; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
+; SI-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
 ; SI-NEXT:    v_add_f32_e32 v17, 0x38000000, v17
-; SI-NEXT:    v_or_b32_e32 v18, v18, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
+; SI-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
+; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
 ; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
 ; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
 ; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
 ; SI-NEXT:    v_add_f32_e32 v16, 0x38000000, v16
-; SI-NEXT:    v_or_b32_e32 v14, v14, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; SI-NEXT:    v_or_b32_e32 v10, v10, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v7
-; SI-NEXT:    v_or_b32_e32 v6, v6, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v3
-; SI-NEXT:    v_or_b32_e32 v2, v2, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; SI-NEXT:    v_lshr_b64 v[23:24], v[1:2], 16
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v19
+; SI-NEXT:    v_lshr_b64 v[24:25], v[9:10], 16
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_or_b32_e32 v18, v18, v20
 ; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; SI-NEXT:    v_lshr_b64 v[25:26], v[13:14], 16
+; SI-NEXT:    v_lshr_b64 v[21:22], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[26:27], v[17:18], 16
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SI-NEXT:    v_or_b32_e32 v4, v4, v5
 ; SI-NEXT:    v_or_b32_e32 v8, v8, v9
 ; SI-NEXT:    v_or_b32_e32 v12, v12, v13
 ; SI-NEXT:    v_or_b32_e32 v16, v16, v17
-; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v9, v10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v13, v14, v13, 16
-; SI-NEXT:    v_alignbit_b32 v17, v18, v17, 16
 ; SI-NEXT:  .LBB47_3: ; %end
+; SI-NEXT:    v_mov_b32_e32 v1, v23
+; SI-NEXT:    v_mov_b32_e32 v5, v21
+; SI-NEXT:    v_mov_b32_e32 v9, v24
+; SI-NEXT:    v_mov_b32_e32 v13, v25
+; SI-NEXT:    v_mov_b32_e32 v17, v26
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB47_4:
 ; SI-NEXT:    s_branch .LBB47_2
@@ -16674,330 +16686,353 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32
 ; SI-LABEL: bitcast_v20i16_to_v40i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_writelane_b32 v8, s30, 0
+; SI-NEXT:    v_writelane_b32 v8, s31, 1
+; SI-NEXT:    v_writelane_b32 v8, s34, 2
+; SI-NEXT:    v_writelane_b32 v8, s35, 3
+; SI-NEXT:    v_writelane_b32 v8, s36, 4
+; SI-NEXT:    v_writelane_b32 v8, s37, 5
+; SI-NEXT:    v_writelane_b32 v8, s38, 6
+; SI-NEXT:    v_writelane_b32 v8, s39, 7
+; SI-NEXT:    v_writelane_b32 v8, s48, 8
+; SI-NEXT:    v_writelane_b32 v8, s49, 9
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; SI-NEXT:    v_readfirstlane_b32 s72, v6
-; SI-NEXT:    v_readfirstlane_b32 s73, v5
-; SI-NEXT:    v_readfirstlane_b32 s62, v2
-; SI-NEXT:    v_readfirstlane_b32 s63, v1
+; SI-NEXT:    v_writelane_b32 v8, s50, 10
+; SI-NEXT:    v_readfirstlane_b32 s39, v6
+; SI-NEXT:    v_readfirstlane_b32 s48, v5
+; SI-NEXT:    v_readfirstlane_b32 s49, v4
+; SI-NEXT:    v_readfirstlane_b32 s50, v3
+; SI-NEXT:    v_readfirstlane_b32 s35, v2
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v4
+; SI-NEXT:    v_readfirstlane_b32 s38, v1
 ; SI-NEXT:    s_cbranch_scc0 .LBB49_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_and_b32 s4, s16, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s17, 16
-; SI-NEXT:    s_or_b32 s13, s4, s5
+; SI-NEXT:    s_or_b32 s12, s4, s5
 ; SI-NEXT:    s_and_b32 s4, s18, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s19, 16
-; SI-NEXT:    s_or_b32 s14, s4, s5
+; SI-NEXT:    s_or_b32 s13, s4, s5
 ; SI-NEXT:    s_and_b32 s4, s20, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s21, 16
-; SI-NEXT:    s_or_b32 s11, s4, s5
+; SI-NEXT:    s_or_b32 s10, s4, s5
 ; SI-NEXT:    s_and_b32 s4, s22, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s23, 16
-; SI-NEXT:    s_or_b32 s12, s4, s5
+; SI-NEXT:    s_or_b32 s11, s4, s5
 ; SI-NEXT:    s_and_b32 s4, s24, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s25, 16
-; SI-NEXT:    v_mov_b32_e32 v1, s13
-; SI-NEXT:    s_or_b32 s9, s4, s5
+; SI-NEXT:    s_or_b32 s8, s4, s5
 ; SI-NEXT:    s_and_b32 s4, s26, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s27, 16
-; SI-NEXT:    v_alignbit_b32 v7, s14, v1, 24
-; SI-NEXT:    v_alignbit_b32 v12, s14, v1, 16
-; SI-NEXT:    v_alignbit_b32 v16, s14, v1, 8
-; SI-NEXT:    v_mov_b32_e32 v1, s11
-; SI-NEXT:    s_or_b32 s10, s4, s5
+; SI-NEXT:    s_or_b32 s9, s4, s5
 ; SI-NEXT:    s_and_b32 s4, s28, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s29, 16
-; SI-NEXT:    v_alignbit_b32 v8, s12, v1, 24
-; SI-NEXT:    v_alignbit_b32 v13, s12, v1, 16
-; SI-NEXT:    v_alignbit_b32 v17, s12, v1, 8
-; SI-NEXT:    v_mov_b32_e32 v1, s9
-; SI-NEXT:    s_or_b32 s7, s4, s5
-; SI-NEXT:    s_and_b32 s4, s63, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s62, 16
-; SI-NEXT:    v_alignbit_b32 v6, s10, v1, 24
-; SI-NEXT:    v_alignbit_b32 v11, s10, v1, 16
-; SI-NEXT:    v_alignbit_b32 v15, s10, v1, 8
-; SI-NEXT:    s_or_b32 s8, s4, s5
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_alignbit_b32 v5, s8, v1, 24
-; SI-NEXT:    v_alignbit_b32 v9, s8, v1, 16
-; SI-NEXT:    v_alignbit_b32 v14, s8, v1, 8
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v3
-; SI-NEXT:    s_and_b32 s4, s73, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s72, 16
-; SI-NEXT:    v_or_b32_e32 v1, v1, v18
 ; SI-NEXT:    s_or_b32 s6, s4, s5
-; SI-NEXT:    v_alignbit_b32 v2, s6, v1, 24
-; SI-NEXT:    v_alignbit_b32 v4, s6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v10, s6, v1, 8
-; SI-NEXT:    s_lshr_b32 s59, s14, 8
-; SI-NEXT:    s_lshr_b32 s56, s12, 8
-; SI-NEXT:    s_lshr_b32 s45, s10, 8
-; SI-NEXT:    s_lshr_b32 s42, s8, 8
-; SI-NEXT:    s_lshr_b32 s15, s6, 8
-; SI-NEXT:    s_and_b32 s60, s19, 0xffff
-; SI-NEXT:    s_and_b32 s57, s23, 0xffff
-; SI-NEXT:    s_and_b32 s46, s27, 0xffff
-; SI-NEXT:    s_and_b32 s43, s62, 0xffff
-; SI-NEXT:    s_and_b32 s40, s72, 0xffff
-; SI-NEXT:    s_bfe_u32 s61, s19, 0x80008
-; SI-NEXT:    s_bfe_u32 s58, s23, 0x80008
-; SI-NEXT:    s_bfe_u32 s47, s27, 0x80008
-; SI-NEXT:    s_bfe_u32 s44, s62, 0x80008
-; SI-NEXT:    s_bfe_u32 s41, s72, 0x80008
+; SI-NEXT:    s_and_b32 s4, s38, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s35, 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[12:13], 24
+; SI-NEXT:    s_or_b32 s7, s4, s5
+; SI-NEXT:    s_and_b32 s4, s50, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s49, 16
+; SI-NEXT:    s_or_b32 s4, s4, s5
+; SI-NEXT:    s_and_b32 s5, s48, 0xffff
+; SI-NEXT:    s_lshl_b32 s15, s39, 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[12:13], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[12:13], 8
+; SI-NEXT:    s_lshr_b64 s[42:43], s[10:11], 24
+; SI-NEXT:    s_lshr_b64 s[46:47], s[10:11], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[10:11], 8
+; SI-NEXT:    s_lshr_b64 s[44:45], s[8:9], 24
+; SI-NEXT:    s_lshr_b64 s[58:59], s[8:9], 16
+; SI-NEXT:    s_or_b32 s5, s5, s15
+; SI-NEXT:    s_lshr_b32 s34, s13, 8
+; SI-NEXT:    s_lshr_b32 s95, s11, 8
+; SI-NEXT:    s_lshr_b32 s59, s9, 8
+; SI-NEXT:    s_lshr_b32 s45, s7, 8
+; SI-NEXT:    s_lshr_b32 s15, s5, 8
+; SI-NEXT:    s_and_b32 s36, s19, 0xffff
+; SI-NEXT:    s_and_b32 s30, s23, 0xffff
+; SI-NEXT:    s_and_b32 s61, s27, 0xffff
+; SI-NEXT:    s_and_b32 s47, s35, 0xffff
+; SI-NEXT:    s_and_b32 s41, s39, 0xffff
+; SI-NEXT:    s_bfe_u32 s37, s19, 0x80008
+; SI-NEXT:    s_bfe_u32 s31, s23, 0x80008
+; SI-NEXT:    s_bfe_u32 s94, s27, 0x80008
+; SI-NEXT:    s_bfe_u32 s57, s35, 0x80008
+; SI-NEXT:    s_bfe_u32 s43, s39, 0x80008
+; SI-NEXT:    s_lshr_b64 s[88:89], s[8:9], 8
+; SI-NEXT:    s_lshr_b64 s[78:79], s[6:7], 24
+; SI-NEXT:    s_lshr_b64 s[90:91], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[74:75], s[6:7], 8
+; SI-NEXT:    s_lshr_b64 s[62:63], s[4:5], 24
+; SI-NEXT:    s_lshr_b64 s[72:73], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[76:77], s[4:5], 8
 ; SI-NEXT:    s_cbranch_execnz .LBB49_3
 ; SI-NEXT:  .LBB49_2: ; %cmp.true
-; SI-NEXT:    s_add_i32 s73, s73, 3
-; SI-NEXT:    s_and_b32 s4, s73, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s72, 16
+; SI-NEXT:    s_add_i32 s50, s50, 3
+; SI-NEXT:    s_and_b32 s4, s50, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s49, 16
+; SI-NEXT:    s_add_i32 s48, s48, 3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    s_and_b32 s5, s48, 0xffff
+; SI-NEXT:    s_lshl_b32 s6, s39, 16
 ; SI-NEXT:    s_add_i32 s28, s28, 3
-; SI-NEXT:    s_add_i32 s6, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s28, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s29, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s63, s63, 3
-; SI-NEXT:    s_add_i32 s7, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s63, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s62, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    s_and_b32 s6, s28, 0xffff
+; SI-NEXT:    s_lshl_b32 s7, s29, 16
+; SI-NEXT:    s_add_i32 s38, s38, 3
+; SI-NEXT:    s_or_b32 s6, s7, s6
+; SI-NEXT:    s_and_b32 s7, s38, 0xffff
+; SI-NEXT:    s_lshl_b32 s8, s35, 16
 ; SI-NEXT:    s_add_i32 s24, s24, 3
-; SI-NEXT:    s_add_i32 s8, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s24, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s25, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    s_or_b32 s7, s8, s7
+; SI-NEXT:    s_and_b32 s8, s24, 0xffff
+; SI-NEXT:    s_lshl_b32 s9, s25, 16
 ; SI-NEXT:    s_add_i32 s26, s26, 3
-; SI-NEXT:    s_add_i32 s9, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s26, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s27, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    s_or_b32 s8, s9, s8
+; SI-NEXT:    s_and_b32 s9, s26, 0xffff
+; SI-NEXT:    s_lshl_b32 s10, s27, 16
 ; SI-NEXT:    s_add_i32 s20, s20, 3
-; SI-NEXT:    s_add_i32 s10, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s20, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s21, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    s_or_b32 s9, s10, s9
+; SI-NEXT:    s_and_b32 s10, s20, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s21, 16
 ; SI-NEXT:    s_add_i32 s22, s22, 3
-; SI-NEXT:    s_add_i32 s11, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s22, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s23, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    s_or_b32 s10, s11, s10
+; SI-NEXT:    s_and_b32 s11, s22, 0xffff
+; SI-NEXT:    s_lshl_b32 s12, s23, 16
 ; SI-NEXT:    s_add_i32 s16, s16, 3
-; SI-NEXT:    s_add_i32 s12, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s16, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s17, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    s_or_b32 s11, s12, s11
+; SI-NEXT:    s_and_b32 s12, s16, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s17, 16
 ; SI-NEXT:    s_add_i32 s18, s18, 3
-; SI-NEXT:    s_add_i32 s13, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s18, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s19, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v3
-; SI-NEXT:    s_add_i32 s14, s4, 0x30000
-; SI-NEXT:    v_mov_b32_e32 v2, s13
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    v_alignbit_b32 v7, s14, v2, 24
-; SI-NEXT:    v_alignbit_b32 v12, s14, v2, 16
-; SI-NEXT:    v_alignbit_b32 v16, s14, v2, 8
-; SI-NEXT:    v_mov_b32_e32 v2, s11
-; SI-NEXT:    v_or_b32_e32 v1, v18, v1
-; SI-NEXT:    v_alignbit_b32 v8, s12, v2, 24
-; SI-NEXT:    v_alignbit_b32 v13, s12, v2, 16
-; SI-NEXT:    v_alignbit_b32 v17, s12, v2, 8
-; SI-NEXT:    v_mov_b32_e32 v2, s9
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x30000, v1
-; SI-NEXT:    v_mov_b32_e32 v3, s6
-; SI-NEXT:    v_alignbit_b32 v6, s10, v2, 24
-; SI-NEXT:    v_alignbit_b32 v11, s10, v2, 16
-; SI-NEXT:    v_alignbit_b32 v15, s10, v2, 8
-; SI-NEXT:    v_mov_b32_e32 v2, s7
-; SI-NEXT:    v_alignbit_b32 v5, s8, v2, 24
-; SI-NEXT:    v_alignbit_b32 v9, s8, v2, 16
-; SI-NEXT:    v_alignbit_b32 v14, s8, v2, 8
-; SI-NEXT:    v_alignbit_b32 v2, v3, v1, 24
-; SI-NEXT:    v_alignbit_b32 v4, v3, v1, 16
-; SI-NEXT:    v_alignbit_b32 v10, v3, v1, 8
-; SI-NEXT:    s_lshr_b32 s61, s14, 24
-; SI-NEXT:    s_lshr_b32 s60, s14, 16
-; SI-NEXT:    s_lshr_b32 s59, s14, 8
-; SI-NEXT:    s_lshr_b32 s58, s12, 24
-; SI-NEXT:    s_lshr_b32 s57, s12, 16
-; SI-NEXT:    s_lshr_b32 s56, s12, 8
-; SI-NEXT:    s_lshr_b32 s47, s10, 24
-; SI-NEXT:    s_lshr_b32 s46, s10, 16
-; SI-NEXT:    s_lshr_b32 s45, s10, 8
-; SI-NEXT:    s_lshr_b32 s44, s8, 24
-; SI-NEXT:    s_lshr_b32 s43, s8, 16
-; SI-NEXT:    s_lshr_b32 s42, s8, 8
-; SI-NEXT:    s_lshr_b32 s41, s6, 24
-; SI-NEXT:    s_lshr_b32 s40, s6, 16
-; SI-NEXT:    s_lshr_b32 s15, s6, 8
+; SI-NEXT:    s_or_b32 s12, s13, s12
+; SI-NEXT:    s_and_b32 s13, s18, 0xffff
+; SI-NEXT:    s_lshl_b32 s14, s19, 16
+; SI-NEXT:    s_or_b32 s13, s14, s13
+; SI-NEXT:    s_add_i32 s8, s8, 0x30000
+; SI-NEXT:    s_add_i32 s9, s9, 0x30000
+; SI-NEXT:    s_add_i32 s10, s10, 0x30000
+; SI-NEXT:    s_add_i32 s11, s11, 0x30000
+; SI-NEXT:    s_add_i32 s12, s12, 0x30000
+; SI-NEXT:    s_add_i32 s13, s13, 0x30000
+; SI-NEXT:    s_add_i32 s4, s4, 0x30000
+; SI-NEXT:    s_add_i32 s5, s5, 0x30000
+; SI-NEXT:    s_add_i32 s6, s6, 0x30000
+; SI-NEXT:    s_add_i32 s7, s7, 0x30000
+; SI-NEXT:    s_lshr_b64 s[14:15], s[12:13], 24
+; SI-NEXT:    s_lshr_b64 s[40:41], s[12:13], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[12:13], 8
+; SI-NEXT:    s_lshr_b64 s[42:43], s[10:11], 24
+; SI-NEXT:    s_lshr_b64 s[46:47], s[10:11], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[10:11], 8
+; SI-NEXT:    s_lshr_b64 s[44:45], s[8:9], 24
+; SI-NEXT:    s_lshr_b64 s[58:59], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[88:89], s[8:9], 8
+; SI-NEXT:    s_lshr_b64 s[78:79], s[6:7], 24
+; SI-NEXT:    s_lshr_b64 s[90:91], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[74:75], s[6:7], 8
+; SI-NEXT:    s_lshr_b64 s[62:63], s[4:5], 24
+; SI-NEXT:    s_lshr_b64 s[72:73], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[76:77], s[4:5], 8
+; SI-NEXT:    s_lshr_b32 s37, s13, 24
+; SI-NEXT:    s_lshr_b32 s36, s13, 16
+; SI-NEXT:    s_lshr_b32 s34, s13, 8
+; SI-NEXT:    s_lshr_b32 s31, s11, 24
+; SI-NEXT:    s_lshr_b32 s30, s11, 16
+; SI-NEXT:    s_lshr_b32 s95, s11, 8
+; SI-NEXT:    s_lshr_b32 s94, s9, 24
+; SI-NEXT:    s_lshr_b32 s61, s9, 16
+; SI-NEXT:    s_lshr_b32 s59, s9, 8
+; SI-NEXT:    s_lshr_b32 s57, s7, 24
+; SI-NEXT:    s_lshr_b32 s47, s7, 16
+; SI-NEXT:    s_lshr_b32 s45, s7, 8
+; SI-NEXT:    s_lshr_b32 s43, s5, 24
+; SI-NEXT:    s_lshr_b32 s41, s5, 16
+; SI-NEXT:    s_lshr_b32 s15, s5, 8
 ; SI-NEXT:  .LBB49_3: ; %end
-; SI-NEXT:    s_and_b32 s4, s13, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v16
-; SI-NEXT:    v_or_b32_e32 v3, s4, v3
-; SI-NEXT:    s_and_b32 s4, s14, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s59, 8
-; SI-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s60, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s13, s61, 24
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v7, v7, v12
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s13, s5
-; SI-NEXT:    v_or_b32_e32 v3, v3, v7
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
+; SI-NEXT:    s_and_b32 s12, s12, 0xff
+; SI-NEXT:    s_lshl_b32 s16, s56, 8
+; SI-NEXT:    s_or_b32 s12, s12, s16
+; SI-NEXT:    s_and_b32 s16, s40, 0xff
+; SI-NEXT:    s_lshl_b32 s16, s16, 16
+; SI-NEXT:    s_lshl_b32 s14, s14, 24
+; SI-NEXT:    s_and_b32 s12, s12, 0xffff
+; SI-NEXT:    s_or_b32 s14, s14, s16
+; SI-NEXT:    s_or_b32 s12, s12, s14
+; SI-NEXT:    v_mov_b32_e32 v1, s12
+; SI-NEXT:    s_and_b32 s12, s13, 0xff
+; SI-NEXT:    s_lshl_b32 s13, s34, 8
+; SI-NEXT:    s_or_b32 s12, s12, s13
+; SI-NEXT:    s_and_b32 s13, s36, 0xff
+; SI-NEXT:    s_lshl_b32 s13, s13, 16
+; SI-NEXT:    s_lshl_b32 s14, s37, 24
+; SI-NEXT:    s_and_b32 s12, s12, 0xffff
+; SI-NEXT:    s_or_b32 s13, s14, s13
+; SI-NEXT:    s_or_b32 s12, s12, s13
+; SI-NEXT:    v_mov_b32_e32 v2, s12
+; SI-NEXT:    s_and_b32 s10, s10, 0xff
+; SI-NEXT:    s_lshl_b32 s12, s60, 8
+; SI-NEXT:    s_or_b32 s10, s10, s12
+; SI-NEXT:    s_and_b32 s12, s46, 0xff
+; SI-NEXT:    s_lshl_b32 s12, s12, 16
+; SI-NEXT:    s_lshl_b32 s13, s42, 24
+; SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; SI-NEXT:    s_or_b32 s12, s13, s12
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 4, v0
-; SI-NEXT:    v_mov_b32_e32 v7, s4
-; SI-NEXT:    buffer_store_dword v7, v3, s[0:3], 0 offen
-; SI-NEXT:    s_and_b32 s4, s11, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v17
-; SI-NEXT:    v_or_b32_e32 v3, s4, v3
-; SI-NEXT:    s_and_b32 s4, s12, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s56, 8
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
+; SI-NEXT:    s_or_b32 s10, s10, s12
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v13
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s57, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s11, s58, 24
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v7, v8, v7
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s11, s5
-; SI-NEXT:    v_or_b32_e32 v3, v3, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 8, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v3, v7, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s10, s11, 0xff
+; SI-NEXT:    s_lshl_b32 s11, s95, 8
+; SI-NEXT:    s_or_b32 s10, s10, s11
+; SI-NEXT:    s_and_b32 s11, s30, 0xff
+; SI-NEXT:    s_lshl_b32 s11, s11, 16
+; SI-NEXT:    s_lshl_b32 s12, s31, 24
+; SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; SI-NEXT:    s_or_b32 s11, s12, s11
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
+; SI-NEXT:    s_or_b32 s10, s10, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 12, v0
-; SI-NEXT:    v_mov_b32_e32 v7, s4
-; SI-NEXT:    buffer_store_dword v7, v3, s[0:3], 0 offen
-; SI-NEXT:    s_and_b32 s4, s9, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v15
-; SI-NEXT:    v_or_b32_e32 v3, s4, v3
-; SI-NEXT:    s_and_b32 s4, s10, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s45, 8
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s8, s8, 0xff
+; SI-NEXT:    s_lshl_b32 s10, s88, 8
+; SI-NEXT:    s_or_b32 s8, s8, s10
+; SI-NEXT:    s_and_b32 s10, s58, 0xff
+; SI-NEXT:    s_lshl_b32 s10, s10, 16
+; SI-NEXT:    s_lshl_b32 s11, s44, 24
+; SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; SI-NEXT:    s_or_b32 s10, s11, s10
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 12, v0
+; SI-NEXT:    s_or_b32 s8, s8, s10
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v11
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s46, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s9, s47, 24
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v6, v6, v7
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s9, s5
-; SI-NEXT:    v_or_b32_e32 v3, v3, v6
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 16, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v3, v6, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s8, s9, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s59, 8
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    s_and_b32 s9, s61, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_lshl_b32 s10, s94, 24
+; SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; SI-NEXT:    s_or_b32 s9, s10, s9
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 16, v0
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 20, v0
-; SI-NEXT:    v_mov_b32_e32 v6, s4
-; SI-NEXT:    buffer_store_dword v6, v3, s[0:3], 0 offen
-; SI-NEXT:    s_and_b32 s4, s7, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v14
-; SI-NEXT:    v_or_b32_e32 v3, s4, v3
-; SI-NEXT:    s_and_b32 s4, s8, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s42, 8
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s6, s6, 0xff
+; SI-NEXT:    s_lshl_b32 s8, s74, 8
+; SI-NEXT:    s_or_b32 s6, s6, s8
+; SI-NEXT:    s_and_b32 s8, s90, 0xff
+; SI-NEXT:    s_lshl_b32 s8, s8, 16
+; SI-NEXT:    s_lshl_b32 s9, s78, 24
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_or_b32 s8, s9, s8
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 20, v0
+; SI-NEXT:    s_or_b32 s6, s6, s8
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v6, 0xff, v9
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s43, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s7, s44, 24
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v5, v5, v6
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s6, s7, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s45, 8
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    s_and_b32 s7, s47, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    s_lshl_b32 s8, s57, 24
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_or_b32 s7, s8, s7
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 24, v0
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s4, s4, 0xff
+; SI-NEXT:    s_lshl_b32 s6, s76, 8
+; SI-NEXT:    s_or_b32 s4, s4, s6
+; SI-NEXT:    s_and_b32 s6, s72, 0xff
+; SI-NEXT:    s_lshl_b32 s6, s6, 16
+; SI-NEXT:    s_lshl_b32 s7, s62, 24
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s7, s5
-; SI-NEXT:    v_or_b32_e32 v3, v3, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 24, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
+; SI-NEXT:    s_or_b32 s6, s7, s6
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 28, v0
+; SI-NEXT:    s_or_b32 s4, s4, s6
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 28, v0
-; SI-NEXT:    v_mov_b32_e32 v5, s4
-; SI-NEXT:    buffer_store_dword v5, v3, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v10
-; SI-NEXT:    s_and_b32 s4, s6, 0xff
+; SI-NEXT:    v_mov_b32_e32 v2, s4
+; SI-NEXT:    s_and_b32 s4, s5, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s15, 8
-; SI-NEXT:    v_or_b32_e32 v1, v1, v3
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v4
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s40, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; SI-NEXT:    s_and_b32 s5, s41, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s6, s41, 24
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    s_lshl_b32 s6, s43, 24
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s5, s6, s5
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 32, v0
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 32, v0
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 36, v0
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT:    v_readlane_b32 s50, v8, 10
+; SI-NEXT:    v_readlane_b32 s49, v8, 9
+; SI-NEXT:    v_readlane_b32 s48, v8, 8
+; SI-NEXT:    v_readlane_b32 s39, v8, 7
+; SI-NEXT:    v_readlane_b32 s38, v8, 6
+; SI-NEXT:    v_readlane_b32 s37, v8, 5
+; SI-NEXT:    v_readlane_b32 s36, v8, 4
+; SI-NEXT:    v_readlane_b32 s35, v8, 3
+; SI-NEXT:    v_readlane_b32 s34, v8, 2
+; SI-NEXT:    v_readlane_b32 s31, v8, 1
+; SI-NEXT:    v_readlane_b32 s30, v8, 0
+; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB49_4:
-; SI-NEXT:    ; implicit-def: $sgpr13
-; SI-NEXT:    ; implicit-def: $vgpr16
-; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; implicit-def: $vgpr7
-; SI-NEXT:    ; implicit-def: $sgpr14
-; SI-NEXT:    ; implicit-def: $sgpr59
-; SI-NEXT:    ; implicit-def: $sgpr60
-; SI-NEXT:    ; implicit-def: $sgpr61
-; SI-NEXT:    ; implicit-def: $sgpr11
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $vgpr8
 ; SI-NEXT:    ; implicit-def: $sgpr12
 ; SI-NEXT:    ; implicit-def: $sgpr56
-; SI-NEXT:    ; implicit-def: $sgpr57
-; SI-NEXT:    ; implicit-def: $sgpr58
-; SI-NEXT:    ; implicit-def: $sgpr9
-; SI-NEXT:    ; implicit-def: $vgpr15
-; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; implicit-def: $vgpr6
+; SI-NEXT:    ; implicit-def: $sgpr40
+; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr34
+; SI-NEXT:    ; implicit-def: $sgpr36
+; SI-NEXT:    ; implicit-def: $sgpr37
 ; SI-NEXT:    ; implicit-def: $sgpr10
-; SI-NEXT:    ; implicit-def: $sgpr45
+; SI-NEXT:    ; implicit-def: $sgpr60
 ; SI-NEXT:    ; implicit-def: $sgpr46
-; SI-NEXT:    ; implicit-def: $sgpr47
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $sgpr8
 ; SI-NEXT:    ; implicit-def: $sgpr42
-; SI-NEXT:    ; implicit-def: $sgpr43
+; SI-NEXT:    ; implicit-def: $sgpr95
+; SI-NEXT:    ; implicit-def: $sgpr30
+; SI-NEXT:    ; implicit-def: $sgpr31
+; SI-NEXT:    ; implicit-def: $sgpr8
+; SI-NEXT:    ; implicit-def: $sgpr88
+; SI-NEXT:    ; implicit-def: $sgpr58
 ; SI-NEXT:    ; implicit-def: $sgpr44
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; implicit-def: $sgpr59
+; SI-NEXT:    ; implicit-def: $sgpr61
+; SI-NEXT:    ; implicit-def: $sgpr94
 ; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr74
+; SI-NEXT:    ; implicit-def: $sgpr45
+; SI-NEXT:    ; implicit-def: $sgpr47
+; SI-NEXT:    ; implicit-def: $sgpr57
 ; SI-NEXT:    ; implicit-def: $sgpr15
-; SI-NEXT:    ; implicit-def: $sgpr40
 ; SI-NEXT:    ; implicit-def: $sgpr41
+; SI-NEXT:    ; implicit-def: $sgpr43
+; SI-NEXT:    ; implicit-def: $sgpr90
+; SI-NEXT:    ; implicit-def: $sgpr78
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr76
+; SI-NEXT:    ; implicit-def: $sgpr72
+; SI-NEXT:    ; implicit-def: $sgpr62
 ; SI-NEXT:    s_branch .LBB49_2
 ;
 ; VI-LABEL: bitcast_v20i16_to_v40i8_scalar:
@@ -19045,286 +19080,280 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v26
-; SI-NEXT:    v_readfirstlane_b32 s14, v19
-; SI-NEXT:    v_readfirstlane_b32 s40, v18
-; SI-NEXT:    v_readfirstlane_b32 s12, v11
-; SI-NEXT:    v_readfirstlane_b32 s13, v10
-; SI-NEXT:    v_readfirstlane_b32 s8, v3
-; SI-NEXT:    v_readfirstlane_b32 s9, v2
-; SI-NEXT:    v_readfirstlane_b32 s7, v1
-; SI-NEXT:    v_readfirstlane_b32 s6, v0
+; SI-NEXT:    v_mov_b32_e32 v31, v18
+; SI-NEXT:    v_mov_b32_e32 v32, v14
+; SI-NEXT:    v_mov_b32_e32 v33, v10
+; SI-NEXT:    v_readfirstlane_b32 s43, v1
+; SI-NEXT:    v_readfirstlane_b32 s42, v0
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v34, 8, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v36, 8, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v35, 24, v13
+; SI-NEXT:    v_lshlrev_b32_e32 v38, 8, v19
+; SI-NEXT:    v_lshlrev_b32_e32 v37, 24, v21
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v7
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v27, 8, v15
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v17
-; SI-NEXT:    v_lshlrev_b32_e32 v28, 24, v13
-; SI-NEXT:    v_lshlrev_b32_e32 v30, 8, v23
-; SI-NEXT:    v_lshlrev_b32_e32 v29, 24, v25
-; SI-NEXT:    v_lshlrev_b32_e32 v31, 24, v21
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v15
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 24, v17
+; SI-NEXT:    v_lshlrev_b32_e32 v23, 8, v23
+; SI-NEXT:    v_lshlrev_b32_e32 v21, 24, v25
 ; SI-NEXT:    s_cbranch_scc0 .LBB51_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    s_and_b32 s4, s20, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s21, 8
+; SI-NEXT:    s_and_b32 s4, s16, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s17, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s22, 0xff
+; SI-NEXT:    s_and_b32 s5, s18, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s10, s23, 24
+; SI-NEXT:    s_lshl_b32 s6, s19, 24
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s10, s5
-; SI-NEXT:    s_or_b32 s11, s4, s5
-; SI-NEXT:    s_and_b32 s4, s18, 0xff
-; SI-NEXT:    s_lshl_b32 s4, s4, 16
-; SI-NEXT:    s_lshl_b32 s10, s19, 24
-; SI-NEXT:    s_or_b32 s4, s10, s4
-; SI-NEXT:    s_and_b32 s10, s28, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s29, 8
-; SI-NEXT:    s_or_b32 s10, s10, s15
-; SI-NEXT:    s_and_b32 s15, s6, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s15, 16
-; SI-NEXT:    s_lshl_b32 s41, s7, 24
-; SI-NEXT:    s_or_b32 s43, s41, s15
-; SI-NEXT:    s_and_b32 s15, s26, 0xff
-; SI-NEXT:    s_lshl_b32 s15, s15, 16
-; SI-NEXT:    s_lshl_b32 s41, s27, 24
-; SI-NEXT:    s_or_b32 s15, s41, s15
-; SI-NEXT:    s_and_b32 s41, s16, 0xff
-; SI-NEXT:    s_lshl_b32 s42, s17, 8
-; SI-NEXT:    s_or_b32 s41, s41, s42
-; SI-NEXT:    s_and_b32 s41, s41, 0xffff
-; SI-NEXT:    v_mov_b32_e32 v1, s4
-; SI-NEXT:    v_and_b32_e32 v9, 0xff, v6
-; SI-NEXT:    v_and_b32_e32 v10, 0xff, v8
-; SI-NEXT:    s_or_b32 s41, s41, s4
+; SI-NEXT:    s_or_b32 s12, s6, s5
+; SI-NEXT:    s_or_b32 s6, s4, s12
 ; SI-NEXT:    s_and_b32 s4, s24, 0xff
-; SI-NEXT:    s_lshl_b32 s42, s25, 8
-; SI-NEXT:    v_or_b32_e32 v9, v9, v2
+; SI-NEXT:    s_lshl_b32 s5, s25, 8
+; SI-NEXT:    s_or_b32 s4, s4, s5
+; SI-NEXT:    s_and_b32 s5, s26, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    s_lshl_b32 s7, s27, 24
+; SI-NEXT:    s_or_b32 s14, s7, s5
+; SI-NEXT:    s_and_b32 s5, s20, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s21, 8
+; SI-NEXT:    s_or_b32 s5, s5, s7
+; SI-NEXT:    s_and_b32 s7, s22, 0xff
+; SI-NEXT:    v_and_b32_e32 v10, 0xff, v33
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    s_lshl_b32 s8, s23, 24
+; SI-NEXT:    v_or_b32_e32 v10, v10, v36
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_or_b32 s7, s8, s7
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v10
+; SI-NEXT:    v_and_b32_e32 v10, 0xff, v12
+; SI-NEXT:    s_or_b32 s13, s5, s7
 ; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT:    s_or_b32 s4, s4, s42
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; SI-NEXT:    v_or_b32_e32 v11, v0, v10
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_mov_b32_e32 v5, s15
-; SI-NEXT:    v_or_b32_e32 v10, v9, v11
+; SI-NEXT:    s_lshr_b64 s[8:9], s[12:13], 16
+; SI-NEXT:    v_or_b32_e32 v13, v35, v10
+; SI-NEXT:    v_and_b32_e32 v10, 0xff, v20
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v8
+; SI-NEXT:    v_and_b32_e32 v18, 0xff, v16
+; SI-NEXT:    s_and_b32 s5, s28, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s29, 8
+; SI-NEXT:    v_and_b32_e32 v9, 0xff, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; SI-NEXT:    v_and_b32_e32 v25, 0xff, v24
+; SI-NEXT:    s_or_b32 s5, s5, s9
+; SI-NEXT:    s_and_b32 s9, s42, 0xff
+; SI-NEXT:    v_or_b32_e32 v9, v9, v34
+; SI-NEXT:    v_or_b32_e32 v17, v37, v10
+; SI-NEXT:    v_and_b32_e32 v10, 0xff, v6
+; SI-NEXT:    v_or_b32_e32 v19, v0, v14
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v32
+; SI-NEXT:    v_or_b32_e32 v39, v5, v18
+; SI-NEXT:    v_and_b32_e32 v18, 0xff, v22
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_lshl_b32 s10, s43, 24
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v9
 ; SI-NEXT:    v_and_b32_e32 v9, 0xff, v4
-; SI-NEXT:    v_and_b32_e32 v13, 0xff, v14
-; SI-NEXT:    v_and_b32_e32 v17, 0xff, v16
-; SI-NEXT:    s_or_b32 s15, s4, s15
-; SI-NEXT:    s_and_b32 s4, s9, 0xff
-; SI-NEXT:    s_lshl_b32 s42, s8, 8
+; SI-NEXT:    v_or_b32_e32 v10, v10, v3
+; SI-NEXT:    v_or_b32_e32 v14, v14, v7
+; SI-NEXT:    v_or_b32_e32 v18, v18, v23
+; SI-NEXT:    v_or_b32_e32 v48, v21, v25
+; SI-NEXT:    v_and_b32_e32 v25, 0xff, v31
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_or_b32 s12, s10, s9
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT:    v_or_b32_e32 v13, v13, v27
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT:    s_or_b32 s4, s4, s42
-; SI-NEXT:    v_or_b32_e32 v15, v3, v9
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; SI-NEXT:    v_or_b32_e32 v19, v7, v17
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_or_b32_e32 v25, v13, v19
-; SI-NEXT:    v_and_b32_e32 v13, 0xff, v12
-; SI-NEXT:    v_and_b32_e32 v17, 0xff, v22
-; SI-NEXT:    v_and_b32_e32 v18, 0xff, v24
-; SI-NEXT:    v_or_b32_e32 v23, s4, v15
-; SI-NEXT:    s_and_b32 s4, s13, 0xff
-; SI-NEXT:    s_lshl_b32 s42, s12, 8
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT:    v_or_b32_e32 v17, v17, v30
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; SI-NEXT:    s_or_b32 s4, s4, s42
-; SI-NEXT:    v_or_b32_e32 v21, v28, v13
-; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; SI-NEXT:    v_or_b32_e32 v32, v29, v18
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_or_b32_e32 v18, v17, v32
-; SI-NEXT:    v_and_b32_e32 v17, 0xff, v20
-; SI-NEXT:    v_or_b32_e32 v26, s4, v21
-; SI-NEXT:    s_and_b32 s4, s40, 0xff
-; SI-NEXT:    s_lshl_b32 s42, s14, 8
-; SI-NEXT:    s_and_b32 s10, s10, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT:    s_or_b32 s4, s4, s42
-; SI-NEXT:    s_or_b32 s10, s10, s43
-; SI-NEXT:    v_or_b32_e32 v33, v31, v17
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v18
+; SI-NEXT:    v_or_b32_e32 v25, v25, v38
+; SI-NEXT:    s_or_b32 s15, s5, s12
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_alignbit_b32 v1, s11, v1, 16
-; SI-NEXT:    v_alignbit_b32 v5, s10, v5, 16
-; SI-NEXT:    v_alignbit_b32 v9, v10, v15, 16
-; SI-NEXT:    v_alignbit_b32 v13, v25, v21, 16
-; SI-NEXT:    v_alignbit_b32 v17, v18, v33, 16
-; SI-NEXT:    v_or_b32_e32 v21, s4, v33
-; SI-NEXT:    s_lshr_b32 s42, s5, 16
-; SI-NEXT:    s_lshr_b32 s43, s43, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v19
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v32
+; SI-NEXT:    v_or_b32_e32 v9, v1, v9
+; SI-NEXT:    v_or_b32_e32 v10, v10, v19
+; SI-NEXT:    v_or_b32_e32 v14, v14, v39
+; SI-NEXT:    v_or_b32_e32 v18, v18, v48
+; SI-NEXT:    v_and_b32_e32 v29, 0xffff, v25
+; SI-NEXT:    s_lshr_b64 s[10:11], s[14:15], 16
+; SI-NEXT:    s_or_b32 s4, s4, s14
+; SI-NEXT:    v_or_b32_e32 v25, v11, v9
+; SI-NEXT:    v_mov_b32_e32 v26, v10
+; SI-NEXT:    v_lshr_b64 v[9:10], v[9:10], 16
+; SI-NEXT:    v_or_b32_e32 v27, v15, v13
+; SI-NEXT:    v_mov_b32_e32 v28, v14
+; SI-NEXT:    v_lshr_b64 v[13:14], v[13:14], 16
+; SI-NEXT:    v_or_b32_e32 v29, v29, v17
+; SI-NEXT:    v_mov_b32_e32 v30, v18
+; SI-NEXT:    v_lshr_b64 v[17:18], v[17:18], 16
+; SI-NEXT:    s_lshr_b32 s9, s7, 16
+; SI-NEXT:    s_lshr_b32 s11, s12, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v19
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v39
+; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v48
+; SI-NEXT:    s_mov_b32 s7, s13
+; SI-NEXT:    s_mov_b32 s5, s15
 ; SI-NEXT:    s_cbranch_execnz .LBB51_3
 ; SI-NEXT:  .LBB51_2: ; %cmp.true
-; SI-NEXT:    s_add_i32 s40, s40, 3
-; SI-NEXT:    s_and_b32 s4, s40, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s14, 8
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v20
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_or_b32_e32 v1, v31, v1
-; SI-NEXT:    v_or_b32_e32 v1, s4, v1
-; SI-NEXT:    v_add_i32_e32 v21, vcc, 0x3000000, v1
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v22
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v24
-; SI-NEXT:    v_or_b32_e32 v1, v30, v1
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x300, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    v_or_b32_e32 v5, v29, v5
-; SI-NEXT:    v_or_b32_e32 v1, v5, v1
-; SI-NEXT:    s_add_i32 s13, s13, 3
-; SI-NEXT:    v_add_i32_e32 v18, vcc, 0x3000000, v1
-; SI-NEXT:    s_and_b32 s4, s13, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s12, 8
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v12
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_or_b32_e32 v1, v28, v1
-; SI-NEXT:    v_or_b32_e32 v1, s4, v1
-; SI-NEXT:    v_add_i32_e32 v26, vcc, 0x3000000, v1
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v14
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v16
-; SI-NEXT:    v_or_b32_e32 v1, v27, v1
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x300, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    v_or_b32_e32 v5, v7, v5
-; SI-NEXT:    v_or_b32_e32 v1, v5, v1
-; SI-NEXT:    s_add_i32 s9, s9, 3
-; SI-NEXT:    v_add_i32_e32 v25, vcc, 0x3000000, v1
-; SI-NEXT:    s_and_b32 s4, s9, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s8, 8
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v4
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 3, v31
+; SI-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; SI-NEXT:    v_add_i32_e32 v10, vcc, 3, v20
+; SI-NEXT:    v_or_b32_e32 v9, v38, v9
+; SI-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 0x300, v9
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; SI-NEXT:    v_or_b32_e32 v10, v37, v10
+; SI-NEXT:    v_or_b32_e32 v9, v10, v9
+; SI-NEXT:    v_add_i32_e32 v29, vcc, 0x3000000, v9
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 3, v22
+; SI-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; SI-NEXT:    v_add_i32_e32 v10, vcc, 3, v24
 ; SI-NEXT:    s_add_i32 s24, s24, 3
-; SI-NEXT:    v_or_b32_e32 v1, s4, v1
+; SI-NEXT:    v_or_b32_e32 v9, v23, v9
+; SI-NEXT:    v_and_b32_e32 v10, 0xff, v10
 ; SI-NEXT:    s_and_b32 s4, s24, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s25, 8
 ; SI-NEXT:    s_add_i32 s26, s26, 3
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 0x300, v9
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
 ; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_and_b32 s8, s26, 0xff
+; SI-NEXT:    s_and_b32 s6, s26, 0xff
+; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; SI-NEXT:    v_or_b32_e32 v10, v21, v10
 ; SI-NEXT:    s_addk_i32 s4, 0x300
 ; SI-NEXT:    s_lshl_b32 s5, s27, 24
-; SI-NEXT:    s_lshl_b32 s8, s8, 16
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s5, s8
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s28, s28, 3
-; SI-NEXT:    s_add_i32 s15, s4, 0x3000000
-; SI-NEXT:    s_and_b32 s4, s28, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s29, 8
-; SI-NEXT:    s_add_i32 s6, s6, 3
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_and_b32 s6, s6, 0xff
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s7, 24
 ; SI-NEXT:    s_lshl_b32 s6, s6, 16
+; SI-NEXT:    v_or_b32_e32 v9, v10, v9
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s5, s5, s6
+; SI-NEXT:    s_add_i32 s28, s28, 3
+; SI-NEXT:    v_add_i32_e32 v30, vcc, 0x3000000, v9
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 3, v33
 ; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    s_and_b32 s5, s28, 0xff
+; SI-NEXT:    s_lshl_b32 s6, s29, 8
+; SI-NEXT:    s_add_i32 s42, s42, 3
+; SI-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; SI-NEXT:    v_add_i32_e32 v10, vcc, 3, v12
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v2
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    s_and_b32 s7, s42, 0xff
+; SI-NEXT:    v_or_b32_e32 v9, v36, v9
+; SI-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 3, v4
+; SI-NEXT:    s_addk_i32 s5, 0x300
+; SI-NEXT:    s_lshl_b32 s6, s43, 24
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 0x300, v9
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_or_b32_e32 v2, v34, v2
+; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_or_b32 s6, s6, s7
 ; SI-NEXT:    s_add_i32 s16, s16, 3
-; SI-NEXT:    s_add_i32 s10, s4, 0x3000000
-; SI-NEXT:    s_and_b32 s4, s16, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s17, 8
+; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; SI-NEXT:    v_or_b32_e32 v10, v35, v10
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x300, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    s_and_b32 s6, s16, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s17, 8
 ; SI-NEXT:    s_add_i32 s18, s18, 3
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_and_b32 s6, s18, 0xff
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s19, 24
-; SI-NEXT:    s_lshl_b32 s6, s6, 16
-; SI-NEXT:    v_add_i32_e32 v23, vcc, 0x3000000, v1
+; SI-NEXT:    v_or_b32_e32 v9, v10, v9
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    v_or_b32_e32 v1, v1, v4
+; SI-NEXT:    s_or_b32 s6, s7, s6
+; SI-NEXT:    s_and_b32 s8, s18, 0xff
+; SI-NEXT:    v_add_i32_e32 v27, vcc, 0x3000000, v9
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 3, v32
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    s_addk_i32 s6, 0x300
+; SI-NEXT:    s_lshl_b32 s7, s19, 24
+; SI-NEXT:    s_lshl_b32 s8, s8, 16
+; SI-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; SI-NEXT:    v_add_i32_e32 v25, vcc, 0x3000000, v1
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v6
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s5, s6
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_or_b32 s7, s7, s8
 ; SI-NEXT:    s_add_i32 s20, s20, 3
-; SI-NEXT:    v_or_b32_e32 v1, v2, v1
+; SI-NEXT:    v_or_b32_e32 v7, v7, v9
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 3, v16
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v8
-; SI-NEXT:    s_add_i32 s41, s4, 0x3000000
-; SI-NEXT:    s_and_b32 s4, s20, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s21, 8
+; SI-NEXT:    s_or_b32 s6, s7, s6
+; SI-NEXT:    s_and_b32 s7, s20, 0xff
+; SI-NEXT:    s_lshl_b32 s8, s21, 8
 ; SI-NEXT:    s_add_i32 s22, s22, 3
+; SI-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_and_b32 s6, s22, 0xff
+; SI-NEXT:    s_or_b32 s7, s8, s7
+; SI-NEXT:    s_and_b32 s9, s22, 0xff
+; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x300, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x300, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s23, 24
-; SI-NEXT:    s_lshl_b32 s6, s6, 16
+; SI-NEXT:    s_addk_i32 s7, 0x300
+; SI-NEXT:    s_lshl_b32 s8, s23, 24
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; SI-NEXT:    v_or_b32_e32 v5, v5, v9
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v2
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s5, s6
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    v_or_b32_e32 v5, v5, v7
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 0x3000000, v0
-; SI-NEXT:    s_add_i32 s11, s4, 0x3000000
-; SI-NEXT:    v_mov_b32_e32 v0, s41
-; SI-NEXT:    v_alignbit_b32 v1, s11, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s15
-; SI-NEXT:    v_alignbit_b32 v5, s10, v0, 16
-; SI-NEXT:    v_alignbit_b32 v9, v10, v23, 16
-; SI-NEXT:    v_alignbit_b32 v13, v25, v26, 16
-; SI-NEXT:    v_alignbit_b32 v17, v18, v21, 16
-; SI-NEXT:    s_lshr_b32 s42, s11, 16
-; SI-NEXT:    s_lshr_b32 s43, s10, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v25
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v18
+; SI-NEXT:    s_or_b32 s7, s8, s7
+; SI-NEXT:    v_add_i32_e32 v28, vcc, 0x3000000, v5
+; SI-NEXT:    v_add_i32_e32 v26, vcc, 0x3000000, v0
+; SI-NEXT:    s_add_i32 s4, s4, 0x3000000
+; SI-NEXT:    s_add_i32 s5, s5, 0x3000000
+; SI-NEXT:    s_add_i32 s6, s6, 0x3000000
+; SI-NEXT:    s_add_i32 s7, s7, 0x3000000
+; SI-NEXT:    s_lshr_b64 s[8:9], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[4:5], 16
+; SI-NEXT:    v_lshr_b64 v[9:10], v[25:26], 16
+; SI-NEXT:    v_lshr_b64 v[13:14], v[27:28], 16
+; SI-NEXT:    v_lshr_b64 v[17:18], v[29:30], 16
+; SI-NEXT:    s_lshr_b32 s9, s7, 16
+; SI-NEXT:    s_lshr_b32 s11, s5, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v26
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v28
+; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v30
 ; SI-NEXT:  .LBB51_3: ; %end
-; SI-NEXT:    v_mov_b32_e32 v0, s41
-; SI-NEXT:    v_mov_b32_e32 v2, s11
-; SI-NEXT:    v_mov_b32_e32 v3, s42
-; SI-NEXT:    v_mov_b32_e32 v4, s15
-; SI-NEXT:    v_mov_b32_e32 v6, s10
-; SI-NEXT:    v_mov_b32_e32 v7, s43
-; SI-NEXT:    v_mov_b32_e32 v8, v23
-; SI-NEXT:    v_mov_b32_e32 v12, v26
-; SI-NEXT:    v_mov_b32_e32 v14, v25
-; SI-NEXT:    v_mov_b32_e32 v16, v21
+; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s8
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    v_mov_b32_e32 v3, s9
+; SI-NEXT:    v_mov_b32_e32 v4, s4
+; SI-NEXT:    v_mov_b32_e32 v5, s10
+; SI-NEXT:    v_mov_b32_e32 v6, s5
+; SI-NEXT:    v_mov_b32_e32 v7, s11
+; SI-NEXT:    v_mov_b32_e32 v8, v25
+; SI-NEXT:    v_mov_b32_e32 v10, v26
+; SI-NEXT:    v_mov_b32_e32 v12, v27
+; SI-NEXT:    v_mov_b32_e32 v14, v28
+; SI-NEXT:    v_mov_b32_e32 v16, v29
+; SI-NEXT:    v_mov_b32_e32 v18, v30
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB51_4:
-; SI-NEXT:    ; implicit-def: $sgpr41
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr11
-; SI-NEXT:    ; implicit-def: $sgpr42
-; SI-NEXT:    ; implicit-def: $sgpr15
-; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr8
+; SI-NEXT:    ; implicit-def: $sgpr9
+; SI-NEXT:    ; implicit-def: $sgpr4
 ; SI-NEXT:    ; implicit-def: $sgpr10
-; SI-NEXT:    ; implicit-def: $sgpr43
-; SI-NEXT:    ; implicit-def: $vgpr23
+; SI-NEXT:    ; implicit-def: $sgpr11
+; SI-NEXT:    ; implicit-def: $vgpr25
 ; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr10
 ; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; implicit-def: $vgpr26
+; SI-NEXT:    ; implicit-def: $vgpr27
 ; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $vgpr25
 ; SI-NEXT:    ; implicit-def: $vgpr15
-; SI-NEXT:    ; implicit-def: $vgpr21
+; SI-NEXT:    ; implicit-def: $vgpr29
 ; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr18
 ; SI-NEXT:    ; implicit-def: $vgpr19
 ; SI-NEXT:    s_branch .LBB51_2
 ;
@@ -20574,78 +20603,78 @@ define inreg <20 x i16> @bitcast_v5f64_to_v20i16_scalar(<5 x double> inreg %a, i
 ; SI-NEXT:    s_cmp_lg_u32 s26, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB55_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s24
-; SI-NEXT:    v_alignbit_b32 v20, s25, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    v_alignbit_b32 v21, s23, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v22, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v23, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v24, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s10, s25, 16
-; SI-NEXT:    s_lshr_b32 s9, s23, 16
-; SI-NEXT:    s_lshr_b32 s8, s21, 16
-; SI-NEXT:    s_lshr_b32 s7, s19, 16
-; SI-NEXT:    s_lshr_b32 s6, s17, 16
+; SI-NEXT:    s_lshr_b32 s40, s25, 16
+; SI-NEXT:    s_lshr_b32 s29, s23, 16
+; SI-NEXT:    s_lshr_b32 s28, s21, 16
+; SI-NEXT:    s_lshr_b32 s27, s19, 16
+; SI-NEXT:    s_lshr_b32 s26, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB55_4
 ; SI-NEXT:  .LBB55_2: ; %cmp.true
-; SI-NEXT:    v_add_f64 v[0:1], s[16:17], 1.0
-; SI-NEXT:    v_add_f64 v[4:5], s[18:19], 1.0
-; SI-NEXT:    v_add_f64 v[8:9], s[20:21], 1.0
-; SI-NEXT:    v_add_f64 v[16:17], s[24:25], 1.0
-; SI-NEXT:    v_add_f64 v[12:13], s[22:23], 1.0
-; SI-NEXT:    v_alignbit_b32 v20, v17, v16, 16
-; SI-NEXT:    v_alignbit_b32 v21, v13, v12, 16
-; SI-NEXT:    v_alignbit_b32 v22, v9, v8, 16
-; SI-NEXT:    v_alignbit_b32 v23, v5, v4, 16
-; SI-NEXT:    v_alignbit_b32 v24, v1, v0, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v13
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; SI-NEXT:    v_add_f64 v[20:21], s[24:25], 1.0
+; SI-NEXT:    v_add_f64 v[22:23], s[22:23], 1.0
+; SI-NEXT:    v_add_f64 v[24:25], s[20:21], 1.0
+; SI-NEXT:    v_add_f64 v[26:27], s[18:19], 1.0
+; SI-NEXT:    v_add_f64 v[28:29], s[16:17], 1.0
+; SI-NEXT:    v_lshr_b64 v[17:18], v[20:21], 16
+; SI-NEXT:    v_lshr_b64 v[13:14], v[22:23], 16
+; SI-NEXT:    v_lshr_b64 v[9:10], v[24:25], 16
+; SI-NEXT:    v_lshr_b64 v[5:6], v[26:27], 16
+; SI-NEXT:    v_lshr_b64 v[1:2], v[28:29], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v21
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v23
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v25
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v27
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v29
 ; SI-NEXT:    s_branch .LBB55_5
 ; SI-NEXT:  .LBB55_3:
-; SI-NEXT:    ; implicit-def: $vgpr24
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $sgpr9
-; SI-NEXT:    ; implicit-def: $vgpr20
+; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr26
 ; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr27
+; SI-NEXT:    ; implicit-def: $sgpr8
+; SI-NEXT:    ; implicit-def: $sgpr28
+; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr29
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr40
 ; SI-NEXT:    s_branch .LBB55_2
 ; SI-NEXT:  .LBB55_4:
-; SI-NEXT:    v_mov_b32_e32 v1, s17
-; SI-NEXT:    v_mov_b32_e32 v5, s19
-; SI-NEXT:    v_mov_b32_e32 v9, s21
-; SI-NEXT:    v_mov_b32_e32 v13, s23
-; SI-NEXT:    v_mov_b32_e32 v17, s25
-; SI-NEXT:    v_mov_b32_e32 v16, s24
-; SI-NEXT:    v_mov_b32_e32 v12, s22
-; SI-NEXT:    v_mov_b32_e32 v8, s20
-; SI-NEXT:    v_mov_b32_e32 v4, s18
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_mov_b32_e32 v3, s6
-; SI-NEXT:    v_mov_b32_e32 v7, s7
-; SI-NEXT:    v_mov_b32_e32 v11, s8
-; SI-NEXT:    v_mov_b32_e32 v15, s9
-; SI-NEXT:    v_mov_b32_e32 v19, s10
+; SI-NEXT:    v_mov_b32_e32 v21, s25
+; SI-NEXT:    v_mov_b32_e32 v23, s23
+; SI-NEXT:    v_mov_b32_e32 v25, s21
+; SI-NEXT:    v_mov_b32_e32 v27, s19
+; SI-NEXT:    v_mov_b32_e32 v29, s17
+; SI-NEXT:    v_mov_b32_e32 v28, s16
+; SI-NEXT:    v_mov_b32_e32 v26, s18
+; SI-NEXT:    v_mov_b32_e32 v24, s20
+; SI-NEXT:    v_mov_b32_e32 v22, s22
+; SI-NEXT:    v_mov_b32_e32 v20, s24
+; SI-NEXT:    v_mov_b32_e32 v19, s40
+; SI-NEXT:    v_mov_b32_e32 v15, s29
+; SI-NEXT:    v_mov_b32_e32 v11, s28
+; SI-NEXT:    v_mov_b32_e32 v7, s27
+; SI-NEXT:    v_mov_b32_e32 v3, s26
+; SI-NEXT:    v_mov_b32_e32 v1, s12
+; SI-NEXT:    v_mov_b32_e32 v5, s10
+; SI-NEXT:    v_mov_b32_e32 v9, s8
+; SI-NEXT:    v_mov_b32_e32 v13, s6
+; SI-NEXT:    v_mov_b32_e32 v17, s4
 ; SI-NEXT:  .LBB55_5: ; %end
-; SI-NEXT:    v_mov_b32_e32 v2, v1
-; SI-NEXT:    v_mov_b32_e32 v6, v5
-; SI-NEXT:    v_mov_b32_e32 v10, v9
-; SI-NEXT:    v_mov_b32_e32 v14, v13
-; SI-NEXT:    v_mov_b32_e32 v18, v17
-; SI-NEXT:    v_mov_b32_e32 v1, v24
-; SI-NEXT:    v_mov_b32_e32 v5, v23
-; SI-NEXT:    v_mov_b32_e32 v9, v22
-; SI-NEXT:    v_mov_b32_e32 v13, v21
-; SI-NEXT:    v_mov_b32_e32 v17, v20
+; SI-NEXT:    v_mov_b32_e32 v0, v28
+; SI-NEXT:    v_mov_b32_e32 v2, v29
+; SI-NEXT:    v_mov_b32_e32 v4, v26
+; SI-NEXT:    v_mov_b32_e32 v6, v27
+; SI-NEXT:    v_mov_b32_e32 v8, v24
+; SI-NEXT:    v_mov_b32_e32 v10, v25
+; SI-NEXT:    v_mov_b32_e32 v12, v22
+; SI-NEXT:    v_mov_b32_e32 v14, v23
+; SI-NEXT:    v_mov_b32_e32 v16, v20
+; SI-NEXT:    v_mov_b32_e32 v18, v21
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bitcast_v5f64_to_v20i16_scalar:
@@ -21435,76 +21464,71 @@ define inreg <20 x i16> @bitcast_v5i64_to_v20i16_scalar(<5 x i64> inreg %a, i32
 ; SI-NEXT:    s_cmp_lg_u32 s26, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB59_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s24
-; SI-NEXT:    v_alignbit_b32 v17, s25, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    v_alignbit_b32 v13, s23, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v9, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s25, 16
-; SI-NEXT:    s_lshr_b32 s7, s23, 16
-; SI-NEXT:    s_lshr_b32 s8, s21, 16
-; SI-NEXT:    s_lshr_b32 s9, s19, 16
-; SI-NEXT:    s_lshr_b32 s10, s17, 16
+; SI-NEXT:    s_lshr_b32 s26, s25, 16
+; SI-NEXT:    s_lshr_b32 s27, s23, 16
+; SI-NEXT:    s_lshr_b32 s28, s21, 16
+; SI-NEXT:    s_lshr_b32 s29, s19, 16
+; SI-NEXT:    s_lshr_b32 s40, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB59_3
 ; SI-NEXT:  .LBB59_2: ; %cmp.true
-; SI-NEXT:    s_add_u32 s16, s16, 3
-; SI-NEXT:    s_addc_u32 s17, s17, 0
-; SI-NEXT:    s_add_u32 s18, s18, 3
-; SI-NEXT:    s_addc_u32 s19, s19, 0
-; SI-NEXT:    s_add_u32 s20, s20, 3
-; SI-NEXT:    s_addc_u32 s21, s21, 0
-; SI-NEXT:    s_add_u32 s22, s22, 3
-; SI-NEXT:    s_addc_u32 s23, s23, 0
 ; SI-NEXT:    s_add_u32 s24, s24, 3
 ; SI-NEXT:    s_addc_u32 s25, s25, 0
-; SI-NEXT:    v_mov_b32_e32 v0, s24
-; SI-NEXT:    v_alignbit_b32 v17, s25, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    v_alignbit_b32 v13, s23, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v9, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s25, 16
-; SI-NEXT:    s_lshr_b32 s7, s23, 16
-; SI-NEXT:    s_lshr_b32 s8, s21, 16
-; SI-NEXT:    s_lshr_b32 s9, s19, 16
-; SI-NEXT:    s_lshr_b32 s10, s17, 16
+; SI-NEXT:    s_add_u32 s22, s22, 3
+; SI-NEXT:    s_addc_u32 s23, s23, 0
+; SI-NEXT:    s_add_u32 s20, s20, 3
+; SI-NEXT:    s_addc_u32 s21, s21, 0
+; SI-NEXT:    s_add_u32 s18, s18, 3
+; SI-NEXT:    s_addc_u32 s19, s19, 0
+; SI-NEXT:    s_add_u32 s16, s16, 3
+; SI-NEXT:    s_addc_u32 s17, s17, 0
+; SI-NEXT:    s_lshr_b32 s26, s25, 16
+; SI-NEXT:    s_lshr_b32 s27, s23, 16
+; SI-NEXT:    s_lshr_b32 s28, s21, 16
+; SI-NEXT:    s_lshr_b32 s29, s19, 16
+; SI-NEXT:    s_lshr_b32 s40, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[16:17], 16
 ; SI-NEXT:  .LBB59_3: ; %end
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
+; SI-NEXT:    v_mov_b32_e32 v1, s12
 ; SI-NEXT:    v_mov_b32_e32 v2, s17
-; SI-NEXT:    v_mov_b32_e32 v3, s10
+; SI-NEXT:    v_mov_b32_e32 v3, s40
 ; SI-NEXT:    v_mov_b32_e32 v4, s18
+; SI-NEXT:    v_mov_b32_e32 v5, s10
 ; SI-NEXT:    v_mov_b32_e32 v6, s19
-; SI-NEXT:    v_mov_b32_e32 v7, s9
+; SI-NEXT:    v_mov_b32_e32 v7, s29
 ; SI-NEXT:    v_mov_b32_e32 v8, s20
+; SI-NEXT:    v_mov_b32_e32 v9, s8
 ; SI-NEXT:    v_mov_b32_e32 v10, s21
-; SI-NEXT:    v_mov_b32_e32 v11, s8
+; SI-NEXT:    v_mov_b32_e32 v11, s28
 ; SI-NEXT:    v_mov_b32_e32 v12, s22
+; SI-NEXT:    v_mov_b32_e32 v13, s6
 ; SI-NEXT:    v_mov_b32_e32 v14, s23
-; SI-NEXT:    v_mov_b32_e32 v15, s7
+; SI-NEXT:    v_mov_b32_e32 v15, s27
 ; SI-NEXT:    v_mov_b32_e32 v16, s24
+; SI-NEXT:    v_mov_b32_e32 v17, s4
 ; SI-NEXT:    v_mov_b32_e32 v18, s25
-; SI-NEXT:    v_mov_b32_e32 v19, s6
+; SI-NEXT:    v_mov_b32_e32 v19, s26
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB59_4:
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr40
 ; SI-NEXT:    ; implicit-def: $sgpr10
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $sgpr9
-; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $sgpr29
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $vgpr17
+; SI-NEXT:    ; implicit-def: $sgpr28
 ; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr27
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr26
 ; SI-NEXT:    s_branch .LBB59_2
 ;
 ; VI-LABEL: bitcast_v5i64_to_v20i16_scalar:
@@ -22779,357 +22803,375 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32
 ; SI-LABEL: bitcast_v20f16_to_v40i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    v_cvt_f16_f32_e32 v16, s17
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, s16
+; SI-NEXT:    v_cvt_f16_f32_e32 v13, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v12, s16
 ; SI-NEXT:    v_cvt_f16_f32_e32 v10, s19
-; SI-NEXT:    v_cvt_f16_f32_e32 v12, s18
-; SI-NEXT:    v_cvt_f16_f32_e32 v39, s21
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, s20
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, s18
+; SI-NEXT:    v_cvt_f16_f32_e32 v16, s21
+; SI-NEXT:    v_cvt_f16_f32_e32 v15, s20
 ; SI-NEXT:    v_cvt_f16_f32_e32 v9, s23
-; SI-NEXT:    v_cvt_f16_f32_e32 v20, s22
-; SI-NEXT:    v_cvt_f16_f32_e32 v54, s25
-; SI-NEXT:    v_cvt_f16_f32_e32 v53, s24
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, s22
+; SI-NEXT:    v_cvt_f16_f32_e32 v19, s25
+; SI-NEXT:    v_cvt_f16_f32_e32 v18, s24
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, s27
-; SI-NEXT:    v_cvt_f16_f32_e32 v50, s26
-; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    v_cvt_f16_f32_e32 v43, s29
+; SI-NEXT:    v_cvt_f16_f32_e32 v17, s26
+; SI-NEXT:    v_cvt_f16_f32_e32 v21, s29
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v41, v1
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v47, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v46, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v20, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v24, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v23, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v45, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v44, s28
+; SI-NEXT:    v_cvt_f16_f32_e32 v22, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, s28
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_cbranch_scc0 .LBB61_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v16
-; SI-NEXT:    v_or_b32_e32 v28, v15, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v10
-; SI-NEXT:    v_or_b32_e32 v24, v12, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v39
-; SI-NEXT:    v_or_b32_e32 v14, v33, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v9
-; SI-NEXT:    v_or_b32_e32 v13, v20, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v54
-; SI-NEXT:    v_or_b32_e32 v7, v53, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
-; SI-NEXT:    v_or_b32_e32 v11, v50, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v43
-; SI-NEXT:    v_or_b32_e32 v5, v44, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
-; SI-NEXT:    v_or_b32_e32 v6, v41, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v47
-; SI-NEXT:    v_or_b32_e32 v4, v46, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; SI-NEXT:    v_or_b32_e32 v3, v45, v3
-; SI-NEXT:    v_alignbit_b32 v30, v24, v28, 24
-; SI-NEXT:    v_alignbit_b32 v35, v24, v28, 16
-; SI-NEXT:    v_alignbit_b32 v37, v24, v28, 8
-; SI-NEXT:    v_alignbit_b32 v29, v13, v14, 24
-; SI-NEXT:    v_alignbit_b32 v31, v13, v14, 16
-; SI-NEXT:    v_alignbit_b32 v36, v13, v14, 8
-; SI-NEXT:    v_alignbit_b32 v23, v11, v7, 24
-; SI-NEXT:    v_alignbit_b32 v26, v11, v7, 16
-; SI-NEXT:    v_alignbit_b32 v32, v11, v7, 8
-; SI-NEXT:    v_alignbit_b32 v19, v6, v5, 24
-; SI-NEXT:    v_alignbit_b32 v21, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v27, v6, v5, 8
-; SI-NEXT:    v_alignbit_b32 v17, v3, v4, 24
-; SI-NEXT:    v_alignbit_b32 v18, v3, v4, 16
-; SI-NEXT:    v_alignbit_b32 v22, v3, v4, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v40, 8, v24
-; SI-NEXT:    v_lshrrev_b32_e32 v52, 8, v13
-; SI-NEXT:    v_lshrrev_b32_e32 v49, 8, v11
-; SI-NEXT:    v_lshrrev_b32_e32 v38, 8, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v25, 8, v3
-; SI-NEXT:    v_bfe_u32 v42, v10, 8, 8
-; SI-NEXT:    v_bfe_u32 v55, v9, 8, 8
-; SI-NEXT:    v_bfe_u32 v51, v8, 8, 8
-; SI-NEXT:    v_bfe_u32 v48, v2, 8, 8
-; SI-NEXT:    v_bfe_u32 v34, v1, 8, 8
+; SI-NEXT:    v_readfirstlane_b32 s4, v13
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v12
+; SI-NEXT:    s_or_b32 s12, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v10
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v11
+; SI-NEXT:    s_or_b32 s13, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v16
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v15
+; SI-NEXT:    s_or_b32 s10, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v9
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v14
+; SI-NEXT:    s_or_b32 s11, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v19
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v18
+; SI-NEXT:    s_or_b32 s8, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v8
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v17
+; SI-NEXT:    s_or_b32 s9, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v21
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v5
+; SI-NEXT:    s_or_b32 s6, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v2
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v20
+; SI-NEXT:    s_or_b32 s7, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v24
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v23
+; SI-NEXT:    s_lshr_b64 s[14:15], s[12:13], 24
+; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s5, v1
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    v_readfirstlane_b32 s15, v22
+; SI-NEXT:    s_lshr_b64 s[16:17], s[12:13], 16
+; SI-NEXT:    s_lshr_b64 s[20:21], s[12:13], 8
+; SI-NEXT:    s_lshr_b64 s[18:19], s[10:11], 24
+; SI-NEXT:    s_lshr_b64 s[22:23], s[10:11], 16
+; SI-NEXT:    s_or_b32 s5, s15, s5
+; SI-NEXT:    s_lshr_b64 s[26:27], s[10:11], 8
+; SI-NEXT:    s_lshr_b64 s[24:25], s[8:9], 24
+; SI-NEXT:    s_lshr_b64 s[28:29], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[8:9], 8
+; SI-NEXT:    s_lshr_b64 s[46:47], s[6:7], 24
+; SI-NEXT:    s_lshr_b64 s[42:43], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[6:7], 8
+; SI-NEXT:    s_lshr_b64 s[56:57], s[4:5], 24
+; SI-NEXT:    s_lshr_b64 s[58:59], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[4:5], 8
+; SI-NEXT:    s_lshr_b32 s23, s13, 8
+; SI-NEXT:    s_lshr_b32 s21, s11, 8
+; SI-NEXT:    s_lshr_b32 s19, s9, 8
+; SI-NEXT:    s_lshr_b32 s17, s7, 8
+; SI-NEXT:    s_lshr_b32 s15, s5, 8
+; SI-NEXT:    v_bfe_u32 v25, v10, 8, 8
+; SI-NEXT:    v_bfe_u32 v7, v9, 8, 8
+; SI-NEXT:    v_bfe_u32 v6, v8, 8, 8
+; SI-NEXT:    v_bfe_u32 v4, v2, 8, 8
+; SI-NEXT:    v_bfe_u32 v3, v1, 8, 8
 ; SI-NEXT:    s_cbranch_execnz .LBB61_3
 ; SI-NEXT:  .LBB61_2: ; %cmp.true
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v47
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v46
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v24
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v23
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v45
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_readfirstlane_b32 s4, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v22
+; SI-NEXT:    v_readfirstlane_b32 s5, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v21
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_or_b32_e32 v4, v4, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, v43
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; SI-NEXT:    v_or_b32_e32 v3, v5, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v44
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v41
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v54
-; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s6, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v5
+; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s5, v1
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    v_readfirstlane_b32 s6, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v19
+; SI-NEXT:    v_readfirstlane_b32 s7, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v20
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
-; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
-; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v53
-; SI-NEXT:    v_or_b32_e32 v5, v5, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT:    v_or_b32_e32 v6, v7, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v11
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v50
-; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
-; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; SI-NEXT:    v_cvt_f32_f16_e32 v17, v33
-; SI-NEXT:    v_or_b32_e32 v7, v13, v7
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v39
-; SI-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v8
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; SI-NEXT:    v_or_b32_e32 v11, v11, v14
-; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v17
-; SI-NEXT:    v_cvt_f32_f16_e32 v17, v20
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
-; SI-NEXT:    v_add_f32_e32 v16, 0x38000000, v16
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
-; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; SI-NEXT:    v_add_f32_e32 v15, 0x38000000, v15
-; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT:    v_add_f32_e32 v17, 0x38000000, v17
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
-; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; SI-NEXT:    v_or_b32_e32 v14, v14, v13
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v9
-; SI-NEXT:    v_or_b32_e32 v28, v15, v16
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v10
-; SI-NEXT:    v_or_b32_e32 v13, v17, v13
-; SI-NEXT:    v_or_b32_e32 v24, v12, v15
-; SI-NEXT:    v_alignbit_b32 v30, v24, v28, 24
-; SI-NEXT:    v_alignbit_b32 v35, v24, v28, 16
-; SI-NEXT:    v_alignbit_b32 v37, v24, v28, 8
-; SI-NEXT:    v_alignbit_b32 v29, v13, v14, 24
-; SI-NEXT:    v_alignbit_b32 v31, v13, v14, 16
-; SI-NEXT:    v_alignbit_b32 v36, v13, v14, 8
-; SI-NEXT:    v_alignbit_b32 v23, v11, v7, 24
-; SI-NEXT:    v_alignbit_b32 v26, v11, v7, 16
-; SI-NEXT:    v_alignbit_b32 v32, v11, v7, 8
-; SI-NEXT:    v_alignbit_b32 v19, v6, v5, 24
-; SI-NEXT:    v_alignbit_b32 v21, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v27, v6, v5, 8
-; SI-NEXT:    v_alignbit_b32 v17, v3, v4, 24
-; SI-NEXT:    v_alignbit_b32 v18, v3, v4, 16
-; SI-NEXT:    v_alignbit_b32 v22, v3, v4, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v40, 8, v24
-; SI-NEXT:    v_lshrrev_b32_e32 v52, 8, v13
-; SI-NEXT:    v_lshrrev_b32_e32 v49, 8, v11
-; SI-NEXT:    v_lshrrev_b32_e32 v38, 8, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v25, 8, v3
-; SI-NEXT:    v_bfe_u32 v42, v10, 8, 8
-; SI-NEXT:    v_bfe_u32 v55, v9, 8, 8
-; SI-NEXT:    v_bfe_u32 v51, v8, 8, 8
-; SI-NEXT:    v_bfe_u32 v48, v2, 8, 8
-; SI-NEXT:    v_bfe_u32 v34, v1, 8, 8
+; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT:    s_lshl_b32 s6, s6, 16
+; SI-NEXT:    s_or_b32 s6, s7, s6
+; SI-NEXT:    v_readfirstlane_b32 s8, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v18
+; SI-NEXT:    v_readfirstlane_b32 s7, v2
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    s_or_b32 s7, s8, s7
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_readfirstlane_b32 s8, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v8
+; SI-NEXT:    s_lshl_b32 s8, s8, 16
+; SI-NEXT:    v_readfirstlane_b32 s9, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v17
+; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v8, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v16
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    s_or_b32 s8, s9, s8
+; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT:    v_readfirstlane_b32 s10, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v15
+; SI-NEXT:    v_readfirstlane_b32 s9, v8
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_or_b32 s9, s10, s9
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_readfirstlane_b32 s10, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v12
+; SI-NEXT:    s_lshl_b32 s10, s10, 16
+; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
+; SI-NEXT:    v_readfirstlane_b32 s11, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v9, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v13
+; SI-NEXT:    s_or_b32 s10, s11, s10
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT:    v_readfirstlane_b32 s11, v9
+; SI-NEXT:    s_lshl_b32 s11, s11, 16
+; SI-NEXT:    v_readfirstlane_b32 s12, v3
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v5
+; SI-NEXT:    s_or_b32 s11, s12, s11
+; SI-NEXT:    v_readfirstlane_b32 s12, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v10
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v11
+; SI-NEXT:    s_lshl_b32 s12, s12, 16
+; SI-NEXT:    v_readfirstlane_b32 s13, v3
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v3
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    s_or_b32 s12, s13, s12
+; SI-NEXT:    v_readfirstlane_b32 s13, v10
+; SI-NEXT:    s_lshl_b32 s13, s13, 16
+; SI-NEXT:    v_readfirstlane_b32 s14, v3
+; SI-NEXT:    s_or_b32 s13, s14, s13
+; SI-NEXT:    s_lshr_b64 s[14:15], s[12:13], 24
+; SI-NEXT:    s_lshr_b64 s[16:17], s[12:13], 16
+; SI-NEXT:    s_lshr_b64 s[20:21], s[12:13], 8
+; SI-NEXT:    s_lshr_b64 s[18:19], s[10:11], 24
+; SI-NEXT:    s_lshr_b64 s[22:23], s[10:11], 16
+; SI-NEXT:    s_lshr_b64 s[26:27], s[10:11], 8
+; SI-NEXT:    s_lshr_b64 s[24:25], s[8:9], 24
+; SI-NEXT:    s_lshr_b64 s[28:29], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[8:9], 8
+; SI-NEXT:    s_lshr_b64 s[46:47], s[6:7], 24
+; SI-NEXT:    s_lshr_b64 s[42:43], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[6:7], 8
+; SI-NEXT:    s_lshr_b64 s[56:57], s[4:5], 24
+; SI-NEXT:    s_lshr_b64 s[58:59], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[4:5], 8
+; SI-NEXT:    s_lshr_b32 s23, s13, 8
+; SI-NEXT:    s_lshr_b32 s21, s11, 8
+; SI-NEXT:    s_lshr_b32 s19, s9, 8
+; SI-NEXT:    s_lshr_b32 s17, s7, 8
+; SI-NEXT:    s_lshr_b32 s15, s5, 8
+; SI-NEXT:    v_bfe_u32 v25, v10, 8, 8
+; SI-NEXT:    v_bfe_u32 v7, v9, 8, 8
+; SI-NEXT:    v_bfe_u32 v6, v8, 8, 8
+; SI-NEXT:    v_bfe_u32 v4, v2, 8, 8
+; SI-NEXT:    v_bfe_u32 v3, v1, 8, 8
 ; SI-NEXT:  .LBB61_3: ; %end
-; SI-NEXT:    v_and_b32_e32 v12, 0xff, v28
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 8, v37
-; SI-NEXT:    v_or_b32_e32 v12, v12, v15
-; SI-NEXT:    v_and_b32_e32 v15, 0xff, v35
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 24, v30
-; SI-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; SI-NEXT:    v_or_b32_e32 v15, v16, v15
-; SI-NEXT:    v_or_b32_e32 v12, v12, v15
-; SI-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v12, 0xff, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 8, v40
-; SI-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; SI-NEXT:    v_or_b32_e32 v12, v12, v15
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 24, v42
-; SI-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; SI-NEXT:    v_or_b32_e32 v10, v15, v10
-; SI-NEXT:    v_or_b32_e32 v10, v12, v10
-; SI-NEXT:    v_add_i32_e32 v12, vcc, 4, v0
-; SI-NEXT:    buffer_store_dword v10, v12, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v10, 0xff, v14
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 8, v36
-; SI-NEXT:    v_or_b32_e32 v10, v10, v12
-; SI-NEXT:    v_and_b32_e32 v12, 0xff, v31
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 24, v29
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; SI-NEXT:    v_or_b32_e32 v12, v14, v12
-; SI-NEXT:    v_or_b32_e32 v10, v10, v12
-; SI-NEXT:    v_add_i32_e32 v12, vcc, 8, v0
-; SI-NEXT:    buffer_store_dword v10, v12, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v10, 0xff, v13
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 8, v52
-; SI-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; SI-NEXT:    v_or_b32_e32 v10, v10, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 24, v55
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; SI-NEXT:    v_or_b32_e32 v9, v12, v9
-; SI-NEXT:    v_or_b32_e32 v9, v10, v9
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 12, v0
-; SI-NEXT:    buffer_store_dword v9, v10, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 8, v32
-; SI-NEXT:    v_or_b32_e32 v7, v7, v9
-; SI-NEXT:    v_and_b32_e32 v9, 0xff, v26
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 24, v23
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; SI-NEXT:    v_or_b32_e32 v9, v10, v9
-; SI-NEXT:    v_or_b32_e32 v7, v7, v9
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 16, v0
-; SI-NEXT:    buffer_store_dword v7, v9, s[0:3], 0 offen
+; SI-NEXT:    s_and_b32 s12, s12, 0xff
+; SI-NEXT:    s_lshl_b32 s20, s20, 8
+; SI-NEXT:    s_and_b32 s16, s16, 0xff
+; SI-NEXT:    s_or_b32 s12, s12, s20
+; SI-NEXT:    s_lshl_b32 s16, s16, 16
+; SI-NEXT:    s_lshl_b32 s14, s14, 24
+; SI-NEXT:    s_and_b32 s12, s12, 0xffff
+; SI-NEXT:    s_or_b32 s14, s14, s16
+; SI-NEXT:    s_or_b32 s12, s12, s14
+; SI-NEXT:    v_mov_b32_e32 v5, s12
+; SI-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen
+; SI-NEXT:    s_and_b32 s12, s13, 0xff
+; SI-NEXT:    s_lshl_b32 s13, s23, 8
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 8, v49
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; SI-NEXT:    v_or_b32_e32 v7, v7, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 24, v51
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; SI-NEXT:    v_or_b32_e32 v8, v9, v8
-; SI-NEXT:    v_or_b32_e32 v7, v7, v8
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 20, v0
-; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; SI-NEXT:    v_and_b32_e32 v5, 0xff, v10
+; SI-NEXT:    s_or_b32 s12, s12, s13
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 24, v25
+; SI-NEXT:    s_and_b32 s12, s12, 0xffff
+; SI-NEXT:    v_or_b32_e32 v5, v10, v5
+; SI-NEXT:    v_or_b32_e32 v5, s12, v5
+; SI-NEXT:    s_and_b32 s10, s10, 0xff
+; SI-NEXT:    s_lshl_b32 s12, s26, 8
+; SI-NEXT:    s_or_b32 s10, s10, s12
+; SI-NEXT:    s_and_b32 s12, s22, 0xff
+; SI-NEXT:    s_lshl_b32 s12, s12, 16
+; SI-NEXT:    s_lshl_b32 s13, s18, 24
+; SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; SI-NEXT:    s_or_b32 s12, s13, s12
+; SI-NEXT:    v_add_i32_e32 v10, vcc, 4, v0
+; SI-NEXT:    s_or_b32 s10, s10, s12
+; SI-NEXT:    buffer_store_dword v5, v10, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v27
-; SI-NEXT:    v_or_b32_e32 v5, v5, v7
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v21
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 24, v19
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; SI-NEXT:    v_or_b32_e32 v7, v8, v7
-; SI-NEXT:    v_or_b32_e32 v5, v5, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 24, v0
+; SI-NEXT:    v_add_i32_e32 v5, vcc, 8, v0
+; SI-NEXT:    v_mov_b32_e32 v10, s10
+; SI-NEXT:    buffer_store_dword v10, v5, s[0:3], 0 offen
+; SI-NEXT:    s_and_b32 s10, s11, 0xff
+; SI-NEXT:    s_lshl_b32 s11, s21, 8
+; SI-NEXT:    v_and_b32_e32 v5, 0xff, v9
+; SI-NEXT:    s_or_b32 s10, s10, s11
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
+; SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; SI-NEXT:    v_or_b32_e32 v5, v7, v5
+; SI-NEXT:    v_or_b32_e32 v5, s10, v5
+; SI-NEXT:    s_and_b32 s8, s8, 0xff
+; SI-NEXT:    s_lshl_b32 s10, s40, 8
+; SI-NEXT:    s_or_b32 s8, s8, s10
+; SI-NEXT:    s_and_b32 s10, s28, 0xff
+; SI-NEXT:    s_lshl_b32 s10, s10, 16
+; SI-NEXT:    s_lshl_b32 s11, s24, 24
+; SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; SI-NEXT:    s_or_b32 s10, s11, s10
+; SI-NEXT:    v_add_i32_e32 v7, vcc, 12, v0
+; SI-NEXT:    s_or_b32 s8, s8, s10
 ; SI-NEXT:    buffer_store_dword v5, v7, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v38
+; SI-NEXT:    v_add_i32_e32 v5, vcc, 16, v0
+; SI-NEXT:    v_mov_b32_e32 v7, s8
+; SI-NEXT:    buffer_store_dword v7, v5, s[0:3], 0 offen
+; SI-NEXT:    s_and_b32 s8, s9, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s19, 8
+; SI-NEXT:    v_and_b32_e32 v5, 0xff, v8
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; SI-NEXT:    v_or_b32_e32 v5, v6, v5
+; SI-NEXT:    v_or_b32_e32 v5, s8, v5
+; SI-NEXT:    s_and_b32 s6, s6, 0xff
+; SI-NEXT:    s_lshl_b32 s8, s44, 8
+; SI-NEXT:    s_or_b32 s6, s6, s8
+; SI-NEXT:    s_and_b32 s8, s42, 0xff
+; SI-NEXT:    s_lshl_b32 s8, s8, 16
+; SI-NEXT:    s_lshl_b32 s9, s46, 24
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_or_b32 s8, s9, s8
+; SI-NEXT:    v_add_i32_e32 v6, vcc, 20, v0
+; SI-NEXT:    s_or_b32 s6, s6, s8
+; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v6, s6
+; SI-NEXT:    s_and_b32 s6, s7, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s17, 8
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_or_b32_e32 v5, v5, v6
+; SI-NEXT:    s_or_b32 s6, s6, s7
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v48
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; SI-NEXT:    v_or_b32_e32 v2, v6, v2
-; SI-NEXT:    v_or_b32_e32 v2, v5, v2
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 28, v0
-; SI-NEXT:    buffer_store_dword v2, v5, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    v_or_b32_e32 v2, v4, v2
+; SI-NEXT:    v_or_b32_e32 v2, s6, v2
+; SI-NEXT:    s_and_b32 s4, s4, 0xff
+; SI-NEXT:    s_lshl_b32 s6, s60, 8
+; SI-NEXT:    s_or_b32 s4, s4, s6
+; SI-NEXT:    s_and_b32 s6, s58, 0xff
+; SI-NEXT:    s_lshl_b32 s6, s6, 16
+; SI-NEXT:    s_lshl_b32 s7, s56, 24
+; SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; SI-NEXT:    s_or_b32 s6, s7, s6
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v22
-; SI-NEXT:    v_or_b32_e32 v2, v2, v4
-; SI-NEXT:    v_and_b32_e32 v4, 0xff, v18
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 24, v17
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    v_or_b32_e32 v4, v5, v4
-; SI-NEXT:    v_or_b32_e32 v2, v2, v4
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 32, v0
+; SI-NEXT:    v_add_i32_e32 v5, vcc, 24, v0
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 28, v0
+; SI-NEXT:    s_or_b32 s4, s4, s6
+; SI-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
 ; SI-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v25
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 32, v0
+; SI-NEXT:    v_mov_b32_e32 v4, s4
+; SI-NEXT:    s_and_b32 s4, s5, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s15, 8
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
+; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v34
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
+; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
+; SI-NEXT:    v_or_b32_e32 v1, s4, v1
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 36, v0
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB61_4:
-; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr37
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr30
-; SI-NEXT:    ; implicit-def: $vgpr24
-; SI-NEXT:    ; implicit-def: $vgpr40
-; SI-NEXT:    ; implicit-def: $vgpr42
-; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr29
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $vgpr52
-; SI-NEXT:    ; implicit-def: $vgpr55
+; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr20
+; SI-NEXT:    ; implicit-def: $sgpr16
+; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr23
+; SI-NEXT:    ; implicit-def: $vgpr25
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr26
+; SI-NEXT:    ; implicit-def: $sgpr22
+; SI-NEXT:    ; implicit-def: $sgpr18
+; SI-NEXT:    ; implicit-def: $sgpr21
 ; SI-NEXT:    ; implicit-def: $vgpr7
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr26
-; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; implicit-def: $vgpr49
-; SI-NEXT:    ; implicit-def: $vgpr51
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr19
+; SI-NEXT:    ; implicit-def: $sgpr8
+; SI-NEXT:    ; implicit-def: $sgpr40
+; SI-NEXT:    ; implicit-def: $sgpr28
+; SI-NEXT:    ; implicit-def: $sgpr24
+; SI-NEXT:    ; implicit-def: $sgpr19
 ; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $vgpr38
-; SI-NEXT:    ; implicit-def: $vgpr48
+; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr44
+; SI-NEXT:    ; implicit-def: $sgpr42
+; SI-NEXT:    ; implicit-def: $sgpr17
 ; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr18
-; SI-NEXT:    ; implicit-def: $vgpr17
 ; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr34
+; SI-NEXT:    ; implicit-def: $sgpr15
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr60
+; SI-NEXT:    ; implicit-def: $sgpr58
+; SI-NEXT:    ; implicit-def: $sgpr56
 ; SI-NEXT:    s_branch .LBB61_2
 ;
 ; VI-LABEL: bitcast_v20f16_to_v40i8_scalar:
@@ -31153,232 +31195,246 @@ define inreg <40 x i8> @bitcast_v5f64_to_v40i8_scalar(<5 x double> inreg %a, i32
 ; SI-NEXT:    s_cmp_lg_u32 s26, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB75_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v1, s24
-; SI-NEXT:    v_alignbit_b32 v2, s25, v1, 24
-; SI-NEXT:    v_alignbit_b32 v11, s25, v1, 16
-; SI-NEXT:    v_alignbit_b32 v12, s25, v1, 8
-; SI-NEXT:    v_mov_b32_e32 v1, s22
-; SI-NEXT:    v_alignbit_b32 v4, s23, v1, 24
-; SI-NEXT:    v_alignbit_b32 v13, s23, v1, 16
-; SI-NEXT:    v_alignbit_b32 v14, s23, v1, 8
-; SI-NEXT:    v_mov_b32_e32 v1, s20
-; SI-NEXT:    v_alignbit_b32 v6, s21, v1, 24
-; SI-NEXT:    v_alignbit_b32 v15, s21, v1, 16
-; SI-NEXT:    v_alignbit_b32 v16, s21, v1, 8
-; SI-NEXT:    v_mov_b32_e32 v1, s18
-; SI-NEXT:    v_alignbit_b32 v8, s19, v1, 24
-; SI-NEXT:    v_alignbit_b32 v10, s19, v1, 16
-; SI-NEXT:    v_alignbit_b32 v17, s19, v1, 8
-; SI-NEXT:    v_mov_b32_e32 v1, s16
-; SI-NEXT:    v_alignbit_b32 v18, s17, v1, 24
-; SI-NEXT:    v_alignbit_b32 v19, s17, v1, 16
-; SI-NEXT:    v_alignbit_b32 v20, s17, v1, 8
-; SI-NEXT:    s_lshr_b32 s6, s25, 24
-; SI-NEXT:    s_lshr_b32 s7, s25, 16
-; SI-NEXT:    s_lshr_b32 s8, s25, 8
-; SI-NEXT:    s_lshr_b32 s9, s23, 24
-; SI-NEXT:    s_lshr_b32 s10, s23, 16
-; SI-NEXT:    s_lshr_b32 s11, s23, 8
-; SI-NEXT:    s_lshr_b32 s12, s21, 24
-; SI-NEXT:    s_lshr_b32 s13, s21, 16
-; SI-NEXT:    s_lshr_b32 s14, s21, 8
-; SI-NEXT:    s_lshr_b32 s15, s19, 24
-; SI-NEXT:    s_lshr_b32 s26, s19, 16
-; SI-NEXT:    s_lshr_b32 s27, s19, 8
-; SI-NEXT:    s_lshr_b32 s28, s17, 24
-; SI-NEXT:    s_lshr_b32 s29, s17, 16
-; SI-NEXT:    s_lshr_b32 s40, s17, 8
+; SI-NEXT:    s_lshr_b32 s72, s25, 24
+; SI-NEXT:    s_lshr_b32 s73, s25, 16
+; SI-NEXT:    s_lshr_b32 s74, s25, 8
+; SI-NEXT:    s_lshr_b32 s75, s23, 24
+; SI-NEXT:    s_lshr_b32 s76, s23, 16
+; SI-NEXT:    s_lshr_b32 s77, s23, 8
+; SI-NEXT:    s_lshr_b32 s78, s21, 24
+; SI-NEXT:    s_lshr_b32 s79, s21, 16
+; SI-NEXT:    s_lshr_b32 s88, s21, 8
+; SI-NEXT:    s_lshr_b32 s89, s19, 24
+; SI-NEXT:    s_lshr_b32 s90, s19, 16
+; SI-NEXT:    s_lshr_b32 s91, s19, 8
+; SI-NEXT:    s_lshr_b32 s92, s17, 24
+; SI-NEXT:    s_lshr_b32 s93, s17, 16
+; SI-NEXT:    s_lshr_b32 s94, s17, 8
+; SI-NEXT:    s_lshr_b64 s[10:11], s[24:25], 24
+; SI-NEXT:    s_lshr_b64 s[4:5], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[24:25], 8
+; SI-NEXT:    s_lshr_b64 s[8:9], s[22:23], 24
+; SI-NEXT:    s_lshr_b64 s[12:13], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[22:23], 8
+; SI-NEXT:    s_lshr_b64 s[26:27], s[20:21], 24
+; SI-NEXT:    s_lshr_b64 s[28:29], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[20:21], 8
+; SI-NEXT:    s_lshr_b64 s[42:43], s[18:19], 24
+; SI-NEXT:    s_lshr_b64 s[44:45], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[18:19], 8
+; SI-NEXT:    s_lshr_b64 s[56:57], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[58:59], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[16:17], 8
 ; SI-NEXT:    s_cbranch_execnz .LBB75_4
 ; SI-NEXT:  .LBB75_2: ; %cmp.true
-; SI-NEXT:    v_add_f64 v[9:10], s[16:17], 1.0
-; SI-NEXT:    v_add_f64 v[7:8], s[18:19], 1.0
-; SI-NEXT:    v_add_f64 v[5:6], s[20:21], 1.0
+; SI-NEXT:    v_add_f64 v[8:9], s[20:21], 1.0
+; SI-NEXT:    v_add_f64 v[15:16], s[18:19], 1.0
+; SI-NEXT:    v_lshr_b64 v[22:23], v[8:9], 8
 ; SI-NEXT:    v_add_f64 v[1:2], s[24:25], 1.0
+; SI-NEXT:    v_lshr_b64 v[23:24], v[15:16], 24
 ; SI-NEXT:    v_add_f64 v[3:4], s[22:23], 1.0
-; SI-NEXT:    v_readfirstlane_b32 s25, v2
+; SI-NEXT:    v_add_f64 v[20:21], s[16:17], 1.0
+; SI-NEXT:    v_lshr_b64 v[24:25], v[15:16], 16
+; SI-NEXT:    v_lshr_b64 v[10:11], v[1:2], 8
+; SI-NEXT:    v_lshr_b64 v[25:26], v[15:16], 8
+; SI-NEXT:    v_lshr_b64 v[11:12], v[3:4], 24
+; SI-NEXT:    v_lshr_b64 v[26:27], v[20:21], 24
+; SI-NEXT:    v_lshr_b64 v[5:6], v[1:2], 24
+; SI-NEXT:    v_lshr_b64 v[12:13], v[3:4], 16
+; SI-NEXT:    v_lshr_b64 v[17:18], v[8:9], 24
+; SI-NEXT:    v_lshr_b64 v[27:28], v[20:21], 16
+; SI-NEXT:    v_readfirstlane_b32 s17, v21
+; SI-NEXT:    v_readfirstlane_b32 s19, v16
+; SI-NEXT:    v_readfirstlane_b32 s21, v9
 ; SI-NEXT:    v_readfirstlane_b32 s23, v4
-; SI-NEXT:    v_readfirstlane_b32 s21, v6
-; SI-NEXT:    v_readfirstlane_b32 s19, v8
-; SI-NEXT:    v_readfirstlane_b32 s17, v10
-; SI-NEXT:    v_alignbit_b32 v2, s25, v1, 24
-; SI-NEXT:    v_alignbit_b32 v11, s25, v1, 16
-; SI-NEXT:    v_alignbit_b32 v12, s25, v1, 8
-; SI-NEXT:    v_alignbit_b32 v4, s23, v3, 24
-; SI-NEXT:    v_alignbit_b32 v13, s23, v3, 16
-; SI-NEXT:    v_alignbit_b32 v14, s23, v3, 8
-; SI-NEXT:    v_alignbit_b32 v6, s21, v5, 24
-; SI-NEXT:    v_alignbit_b32 v15, s21, v5, 16
-; SI-NEXT:    v_alignbit_b32 v16, s21, v5, 8
-; SI-NEXT:    v_alignbit_b32 v8, s19, v7, 24
-; SI-NEXT:    s_lshr_b32 s6, s25, 24
-; SI-NEXT:    s_lshr_b32 s7, s25, 16
-; SI-NEXT:    s_lshr_b32 s8, s25, 8
-; SI-NEXT:    s_lshr_b32 s9, s23, 24
-; SI-NEXT:    s_lshr_b32 s10, s23, 16
-; SI-NEXT:    s_lshr_b32 s11, s23, 8
-; SI-NEXT:    s_lshr_b32 s12, s21, 24
-; SI-NEXT:    s_lshr_b32 s13, s21, 16
-; SI-NEXT:    s_lshr_b32 s14, s21, 8
-; SI-NEXT:    s_lshr_b32 s15, s19, 24
-; SI-NEXT:    s_lshr_b32 s26, s19, 16
-; SI-NEXT:    s_lshr_b32 s27, s19, 8
-; SI-NEXT:    s_lshr_b32 s28, s17, 24
-; SI-NEXT:    s_lshr_b32 s29, s17, 16
-; SI-NEXT:    s_lshr_b32 s40, s17, 8
-; SI-NEXT:    v_alignbit_b32 v10, s19, v7, 16
-; SI-NEXT:    v_alignbit_b32 v17, s19, v7, 8
-; SI-NEXT:    v_alignbit_b32 v18, s17, v9, 24
-; SI-NEXT:    v_alignbit_b32 v19, s17, v9, 16
-; SI-NEXT:    v_alignbit_b32 v20, s17, v9, 8
+; SI-NEXT:    v_readfirstlane_b32 s25, v2
+; SI-NEXT:    v_lshr_b64 v[6:7], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[13:14], v[3:4], 8
+; SI-NEXT:    v_lshr_b64 v[18:19], v[8:9], 16
+; SI-NEXT:    v_lshr_b64 v[28:29], v[20:21], 8
+; SI-NEXT:    s_lshr_b32 s72, s25, 24
+; SI-NEXT:    s_lshr_b32 s73, s25, 16
+; SI-NEXT:    s_lshr_b32 s74, s25, 8
+; SI-NEXT:    s_lshr_b32 s75, s23, 24
+; SI-NEXT:    s_lshr_b32 s76, s23, 16
+; SI-NEXT:    s_lshr_b32 s77, s23, 8
+; SI-NEXT:    s_lshr_b32 s78, s21, 24
+; SI-NEXT:    s_lshr_b32 s79, s21, 16
+; SI-NEXT:    s_lshr_b32 s88, s21, 8
+; SI-NEXT:    s_lshr_b32 s89, s19, 24
+; SI-NEXT:    s_lshr_b32 s90, s19, 16
+; SI-NEXT:    s_lshr_b32 s91, s19, 8
+; SI-NEXT:    s_lshr_b32 s92, s17, 24
+; SI-NEXT:    s_lshr_b32 s93, s17, 16
+; SI-NEXT:    s_lshr_b32 s94, s17, 8
 ; SI-NEXT:    s_branch .LBB75_5
 ; SI-NEXT:  .LBB75_3:
-; SI-NEXT:    ; implicit-def: $vgpr20
-; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    ; implicit-def: $vgpr18
+; SI-NEXT:    ; implicit-def: $sgpr60
+; SI-NEXT:    ; implicit-def: $sgpr58
+; SI-NEXT:    ; implicit-def: $sgpr56
+; SI-NEXT:    ; implicit-def: $sgpr94
+; SI-NEXT:    ; implicit-def: $sgpr93
+; SI-NEXT:    ; implicit-def: $sgpr92
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr44
+; SI-NEXT:    ; implicit-def: $sgpr42
+; SI-NEXT:    ; implicit-def: $sgpr91
+; SI-NEXT:    ; implicit-def: $sgpr90
+; SI-NEXT:    ; implicit-def: $sgpr89
 ; SI-NEXT:    ; implicit-def: $sgpr40
-; SI-NEXT:    ; implicit-def: $sgpr29
 ; SI-NEXT:    ; implicit-def: $sgpr28
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $sgpr27
 ; SI-NEXT:    ; implicit-def: $sgpr26
-; SI-NEXT:    ; implicit-def: $sgpr15
-; SI-NEXT:    ; implicit-def: $vgpr16
-; SI-NEXT:    ; implicit-def: $vgpr15
-; SI-NEXT:    ; implicit-def: $vgpr6
+; SI-NEXT:    ; implicit-def: $sgpr88
+; SI-NEXT:    ; implicit-def: $sgpr79
+; SI-NEXT:    ; implicit-def: $sgpr78
 ; SI-NEXT:    ; implicit-def: $sgpr14
-; SI-NEXT:    ; implicit-def: $sgpr13
 ; SI-NEXT:    ; implicit-def: $sgpr12
-; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $sgpr11
-; SI-NEXT:    ; implicit-def: $sgpr10
-; SI-NEXT:    ; implicit-def: $sgpr9
-; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $sgpr7
+; SI-NEXT:    ; implicit-def: $sgpr77
+; SI-NEXT:    ; implicit-def: $sgpr76
+; SI-NEXT:    ; implicit-def: $sgpr75
 ; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr74
+; SI-NEXT:    ; implicit-def: $sgpr73
+; SI-NEXT:    ; implicit-def: $sgpr72
+; SI-NEXT:    ; implicit-def: $sgpr10
 ; SI-NEXT:    s_branch .LBB75_2
 ; SI-NEXT:  .LBB75_4:
-; SI-NEXT:    v_mov_b32_e32 v1, s24
+; SI-NEXT:    v_mov_b32_e32 v20, s16
+; SI-NEXT:    v_mov_b32_e32 v15, s18
+; SI-NEXT:    v_mov_b32_e32 v8, s20
 ; SI-NEXT:    v_mov_b32_e32 v3, s22
-; SI-NEXT:    v_mov_b32_e32 v5, s20
-; SI-NEXT:    v_mov_b32_e32 v7, s18
-; SI-NEXT:    v_mov_b32_e32 v9, s16
+; SI-NEXT:    v_mov_b32_e32 v1, s24
+; SI-NEXT:    v_mov_b32_e32 v28, s60
+; SI-NEXT:    v_mov_b32_e32 v27, s58
+; SI-NEXT:    v_mov_b32_e32 v26, s56
+; SI-NEXT:    v_mov_b32_e32 v25, s46
+; SI-NEXT:    v_mov_b32_e32 v24, s44
+; SI-NEXT:    v_mov_b32_e32 v23, s42
+; SI-NEXT:    v_mov_b32_e32 v22, s40
+; SI-NEXT:    v_mov_b32_e32 v18, s28
+; SI-NEXT:    v_mov_b32_e32 v17, s26
+; SI-NEXT:    v_mov_b32_e32 v13, s14
+; SI-NEXT:    v_mov_b32_e32 v12, s12
+; SI-NEXT:    v_mov_b32_e32 v11, s8
+; SI-NEXT:    v_mov_b32_e32 v10, s6
+; SI-NEXT:    v_mov_b32_e32 v6, s4
+; SI-NEXT:    v_mov_b32_e32 v5, s10
 ; SI-NEXT:  .LBB75_5: ; %end
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v20
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v28
 ; SI-NEXT:    s_and_b32 s4, s17, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s40, 8
-; SI-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 8, v20
-; SI-NEXT:    v_and_b32_e32 v19, 0xff, v19
+; SI-NEXT:    s_lshl_b32 s5, s94, 8
+; SI-NEXT:    v_or_b32_e32 v2, v2, v4
+; SI-NEXT:    v_and_b32_e32 v4, 0xff, v27
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s29, 0xff
-; SI-NEXT:    v_or_b32_e32 v9, v9, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 24, v18
+; SI-NEXT:    s_and_b32 s5, s93, 0xff
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v26
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s28, 24
-; SI-NEXT:    v_or_b32_e32 v18, v18, v19
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; SI-NEXT:    s_lshl_b32 s6, s92, 24
+; SI-NEXT:    v_or_b32_e32 v4, v7, v4
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
-; SI-NEXT:    v_or_b32_e32 v9, v9, v18
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    v_or_b32_e32 v2, v2, v4
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 4, v0
-; SI-NEXT:    v_mov_b32_e32 v18, s4
-; SI-NEXT:    buffer_store_dword v18, v9, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 8, v17
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
+; SI-NEXT:    v_mov_b32_e32 v4, s4
+; SI-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v15
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v25
 ; SI-NEXT:    s_and_b32 s4, s19, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s27, 8
-; SI-NEXT:    v_or_b32_e32 v7, v7, v9
-; SI-NEXT:    v_and_b32_e32 v9, 0xff, v10
+; SI-NEXT:    s_lshl_b32 s5, s91, 8
+; SI-NEXT:    v_or_b32_e32 v2, v2, v4
+; SI-NEXT:    v_and_b32_e32 v4, 0xff, v24
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s26, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; SI-NEXT:    s_and_b32 s5, s90, 0xff
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v23
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s15, s15, 24
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; SI-NEXT:    v_or_b32_e32 v8, v8, v9
+; SI-NEXT:    s_lshl_b32 s6, s89, 24
+; SI-NEXT:    v_or_b32_e32 v4, v7, v4
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s15, s5
-; SI-NEXT:    v_or_b32_e32 v7, v7, v8
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 8, v0
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    v_or_b32_e32 v2, v2, v4
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 8, v0
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 12, v0
-; SI-NEXT:    v_mov_b32_e32 v8, s4
-; SI-NEXT:    buffer_store_dword v8, v7, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v16
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 12, v0
+; SI-NEXT:    v_mov_b32_e32 v4, s4
+; SI-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v8
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v22
 ; SI-NEXT:    s_and_b32 s4, s21, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s14, 8
-; SI-NEXT:    v_or_b32_e32 v5, v5, v7
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v15
+; SI-NEXT:    s_lshl_b32 s5, s88, 8
+; SI-NEXT:    v_or_b32_e32 v2, v2, v4
+; SI-NEXT:    v_and_b32_e32 v4, 0xff, v18
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s13, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; SI-NEXT:    s_and_b32 s5, s79, 0xff
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v17
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s12, s12, 24
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; SI-NEXT:    v_or_b32_e32 v6, v6, v7
+; SI-NEXT:    s_lshl_b32 s6, s78, 24
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    v_or_b32_e32 v4, v7, v4
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s12, s5
-; SI-NEXT:    v_or_b32_e32 v5, v5, v6
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 16, v0
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    v_or_b32_e32 v2, v2, v4
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 16, v0
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 20, v0
-; SI-NEXT:    v_mov_b32_e32 v6, s4
-; SI-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v14
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 20, v0
+; SI-NEXT:    v_mov_b32_e32 v4, s4
+; SI-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v13
 ; SI-NEXT:    s_and_b32 s4, s23, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s11, 8
-; SI-NEXT:    v_or_b32_e32 v3, v3, v5
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v13
+; SI-NEXT:    s_lshl_b32 s5, s77, 8
+; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_and_b32_e32 v3, 0xff, v12
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s10, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; SI-NEXT:    s_and_b32 s5, s76, 0xff
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v11
 ; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s9, s9, 24
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v4, v4, v5
+; SI-NEXT:    s_lshl_b32 s6, s75, 24
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    v_or_b32_e32 v3, v4, v3
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s9, s5
-; SI-NEXT:    v_or_b32_e32 v3, v3, v4
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 24, v0
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 24, v0
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 28, v0
-; SI-NEXT:    v_mov_b32_e32 v4, s4
-; SI-NEXT:    buffer_store_dword v4, v3, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 28, v0
+; SI-NEXT:    v_mov_b32_e32 v3, s4
+; SI-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v12
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v10
 ; SI-NEXT:    s_and_b32 s4, s25, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s8, 8
-; SI-NEXT:    v_or_b32_e32 v1, v1, v3
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v11
+; SI-NEXT:    s_lshl_b32 s5, s74, 8
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v6
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s7, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; SI-NEXT:    s_and_b32 s5, s73, 0xff
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v5
 ; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s6, s6, 24
+; SI-NEXT:    s_lshl_b32 s6, s72, 24
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_or_b32_e32 v2, v3, v2
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s5, s6, s5
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
@@ -35291,241 +35347,239 @@ define inreg <40 x i8> @bitcast_v5i64_to_v40i8_scalar(<5 x i64> inreg %a, i32 in
 ; SI-NEXT:    s_cmp_lg_u32 s26, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB79_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v3, s24
-; SI-NEXT:    v_mov_b32_e32 v6, s22
-; SI-NEXT:    v_mov_b32_e32 v9, s20
-; SI-NEXT:    v_mov_b32_e32 v12, s18
-; SI-NEXT:    v_mov_b32_e32 v15, s16
-; SI-NEXT:    v_alignbit_b32 v1, s25, v3, 24
-; SI-NEXT:    v_alignbit_b32 v2, s25, v3, 16
-; SI-NEXT:    v_alignbit_b32 v3, s25, v3, 8
-; SI-NEXT:    v_alignbit_b32 v4, s23, v6, 24
-; SI-NEXT:    v_alignbit_b32 v5, s23, v6, 16
-; SI-NEXT:    v_alignbit_b32 v6, s23, v6, 8
-; SI-NEXT:    v_alignbit_b32 v7, s21, v9, 24
-; SI-NEXT:    v_alignbit_b32 v8, s21, v9, 16
-; SI-NEXT:    v_alignbit_b32 v9, s21, v9, 8
-; SI-NEXT:    v_alignbit_b32 v10, s19, v12, 24
-; SI-NEXT:    v_alignbit_b32 v11, s19, v12, 16
-; SI-NEXT:    v_alignbit_b32 v12, s19, v12, 8
-; SI-NEXT:    v_alignbit_b32 v13, s17, v15, 24
-; SI-NEXT:    v_alignbit_b32 v14, s17, v15, 16
-; SI-NEXT:    v_alignbit_b32 v15, s17, v15, 8
-; SI-NEXT:    s_lshr_b32 s6, s25, 24
-; SI-NEXT:    s_lshr_b32 s7, s25, 16
-; SI-NEXT:    s_lshr_b32 s8, s25, 8
-; SI-NEXT:    s_lshr_b32 s9, s23, 24
-; SI-NEXT:    s_lshr_b32 s10, s23, 16
-; SI-NEXT:    s_lshr_b32 s11, s23, 8
-; SI-NEXT:    s_lshr_b32 s12, s21, 24
-; SI-NEXT:    s_lshr_b32 s13, s21, 16
-; SI-NEXT:    s_lshr_b32 s14, s21, 8
-; SI-NEXT:    s_lshr_b32 s15, s19, 24
-; SI-NEXT:    s_lshr_b32 s26, s19, 16
-; SI-NEXT:    s_lshr_b32 s27, s19, 8
-; SI-NEXT:    s_lshr_b32 s28, s17, 24
-; SI-NEXT:    s_lshr_b32 s29, s17, 16
-; SI-NEXT:    s_lshr_b32 s40, s17, 8
+; SI-NEXT:    s_lshr_b32 s72, s25, 24
+; SI-NEXT:    s_lshr_b32 s73, s25, 16
+; SI-NEXT:    s_lshr_b32 s74, s25, 8
+; SI-NEXT:    s_lshr_b32 s75, s23, 24
+; SI-NEXT:    s_lshr_b32 s76, s23, 16
+; SI-NEXT:    s_lshr_b32 s77, s23, 8
+; SI-NEXT:    s_lshr_b32 s78, s21, 24
+; SI-NEXT:    s_lshr_b32 s79, s21, 16
+; SI-NEXT:    s_lshr_b32 s88, s21, 8
+; SI-NEXT:    s_lshr_b32 s89, s19, 24
+; SI-NEXT:    s_lshr_b32 s90, s19, 16
+; SI-NEXT:    s_lshr_b32 s91, s19, 8
+; SI-NEXT:    s_lshr_b32 s92, s17, 24
+; SI-NEXT:    s_lshr_b32 s93, s17, 16
+; SI-NEXT:    s_lshr_b32 s94, s17, 8
+; SI-NEXT:    s_lshr_b64 s[4:5], s[24:25], 24
+; SI-NEXT:    s_lshr_b64 s[6:7], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[24:25], 8
+; SI-NEXT:    s_lshr_b64 s[10:11], s[22:23], 24
+; SI-NEXT:    s_lshr_b64 s[12:13], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[22:23], 8
+; SI-NEXT:    s_lshr_b64 s[26:27], s[20:21], 24
+; SI-NEXT:    s_lshr_b64 s[28:29], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[20:21], 8
+; SI-NEXT:    s_lshr_b64 s[42:43], s[18:19], 24
+; SI-NEXT:    s_lshr_b64 s[44:45], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[18:19], 8
+; SI-NEXT:    s_lshr_b64 s[56:57], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[58:59], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[16:17], 8
 ; SI-NEXT:    s_cbranch_execnz .LBB79_3
 ; SI-NEXT:  .LBB79_2: ; %cmp.true
-; SI-NEXT:    s_add_u32 s16, s16, 3
-; SI-NEXT:    s_addc_u32 s17, s17, 0
-; SI-NEXT:    s_add_u32 s18, s18, 3
-; SI-NEXT:    s_addc_u32 s19, s19, 0
-; SI-NEXT:    s_add_u32 s20, s20, 3
-; SI-NEXT:    s_addc_u32 s21, s21, 0
-; SI-NEXT:    s_add_u32 s22, s22, 3
-; SI-NEXT:    s_addc_u32 s23, s23, 0
 ; SI-NEXT:    s_add_u32 s24, s24, 3
 ; SI-NEXT:    s_addc_u32 s25, s25, 0
-; SI-NEXT:    v_mov_b32_e32 v3, s24
-; SI-NEXT:    v_mov_b32_e32 v6, s22
-; SI-NEXT:    v_mov_b32_e32 v9, s20
-; SI-NEXT:    v_mov_b32_e32 v12, s18
-; SI-NEXT:    v_mov_b32_e32 v15, s16
-; SI-NEXT:    v_alignbit_b32 v1, s25, v3, 24
-; SI-NEXT:    v_alignbit_b32 v2, s25, v3, 16
-; SI-NEXT:    v_alignbit_b32 v3, s25, v3, 8
-; SI-NEXT:    v_alignbit_b32 v4, s23, v6, 24
-; SI-NEXT:    v_alignbit_b32 v5, s23, v6, 16
-; SI-NEXT:    v_alignbit_b32 v6, s23, v6, 8
-; SI-NEXT:    v_alignbit_b32 v7, s21, v9, 24
-; SI-NEXT:    v_alignbit_b32 v8, s21, v9, 16
-; SI-NEXT:    v_alignbit_b32 v9, s21, v9, 8
-; SI-NEXT:    v_alignbit_b32 v10, s19, v12, 24
-; SI-NEXT:    v_alignbit_b32 v11, s19, v12, 16
-; SI-NEXT:    v_alignbit_b32 v12, s19, v12, 8
-; SI-NEXT:    v_alignbit_b32 v13, s17, v15, 24
-; SI-NEXT:    v_alignbit_b32 v14, s17, v15, 16
-; SI-NEXT:    v_alignbit_b32 v15, s17, v15, 8
-; SI-NEXT:    s_lshr_b32 s6, s25, 24
-; SI-NEXT:    s_lshr_b32 s7, s25, 16
-; SI-NEXT:    s_lshr_b32 s8, s25, 8
-; SI-NEXT:    s_lshr_b32 s9, s23, 24
-; SI-NEXT:    s_lshr_b32 s10, s23, 16
-; SI-NEXT:    s_lshr_b32 s11, s23, 8
-; SI-NEXT:    s_lshr_b32 s12, s21, 24
-; SI-NEXT:    s_lshr_b32 s13, s21, 16
-; SI-NEXT:    s_lshr_b32 s14, s21, 8
-; SI-NEXT:    s_lshr_b32 s15, s19, 24
-; SI-NEXT:    s_lshr_b32 s26, s19, 16
-; SI-NEXT:    s_lshr_b32 s27, s19, 8
-; SI-NEXT:    s_lshr_b32 s28, s17, 24
-; SI-NEXT:    s_lshr_b32 s29, s17, 16
-; SI-NEXT:    s_lshr_b32 s40, s17, 8
+; SI-NEXT:    s_add_u32 s22, s22, 3
+; SI-NEXT:    s_addc_u32 s23, s23, 0
+; SI-NEXT:    s_add_u32 s20, s20, 3
+; SI-NEXT:    s_addc_u32 s21, s21, 0
+; SI-NEXT:    s_add_u32 s18, s18, 3
+; SI-NEXT:    s_addc_u32 s19, s19, 0
+; SI-NEXT:    s_add_u32 s16, s16, 3
+; SI-NEXT:    s_addc_u32 s17, s17, 0
+; SI-NEXT:    s_lshr_b32 s72, s25, 24
+; SI-NEXT:    s_lshr_b32 s73, s25, 16
+; SI-NEXT:    s_lshr_b32 s74, s25, 8
+; SI-NEXT:    s_lshr_b32 s75, s23, 24
+; SI-NEXT:    s_lshr_b32 s76, s23, 16
+; SI-NEXT:    s_lshr_b32 s77, s23, 8
+; SI-NEXT:    s_lshr_b32 s78, s21, 24
+; SI-NEXT:    s_lshr_b32 s79, s21, 16
+; SI-NEXT:    s_lshr_b32 s88, s21, 8
+; SI-NEXT:    s_lshr_b32 s89, s19, 24
+; SI-NEXT:    s_lshr_b32 s90, s19, 16
+; SI-NEXT:    s_lshr_b32 s91, s19, 8
+; SI-NEXT:    s_lshr_b32 s92, s17, 24
+; SI-NEXT:    s_lshr_b32 s93, s17, 16
+; SI-NEXT:    s_lshr_b32 s94, s17, 8
+; SI-NEXT:    s_lshr_b64 s[4:5], s[24:25], 24
+; SI-NEXT:    s_lshr_b64 s[6:7], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[24:25], 8
+; SI-NEXT:    s_lshr_b64 s[10:11], s[22:23], 24
+; SI-NEXT:    s_lshr_b64 s[12:13], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[22:23], 8
+; SI-NEXT:    s_lshr_b64 s[26:27], s[20:21], 24
+; SI-NEXT:    s_lshr_b64 s[28:29], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[20:21], 8
+; SI-NEXT:    s_lshr_b64 s[42:43], s[18:19], 24
+; SI-NEXT:    s_lshr_b64 s[44:45], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[18:19], 8
+; SI-NEXT:    s_lshr_b64 s[56:57], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[58:59], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[16:17], 8
 ; SI-NEXT:  .LBB79_3: ; %end
-; SI-NEXT:    s_and_b32 s4, s16, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 8, v15
-; SI-NEXT:    v_or_b32_e32 v15, s4, v15
-; SI-NEXT:    s_and_b32 s4, s17, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s40, 8
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s29, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s28, 24
-; SI-NEXT:    v_and_b32_e32 v14, 0xff, v14
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 24, v13
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_or_b32_e32 v13, v13, v14
-; SI-NEXT:    v_mov_b32_e32 v14, s4
-; SI-NEXT:    s_and_b32 s4, s18, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 8, v12
-; SI-NEXT:    v_or_b32_e32 v12, s4, v12
-; SI-NEXT:    s_and_b32 s4, s19, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s27, 8
-; SI-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s26, 0xff
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s15, s15, 24
-; SI-NEXT:    v_or_b32_e32 v13, v15, v13
-; SI-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; SI-NEXT:    v_or_b32_e32 v10, v10, v11
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s15, s5
-; SI-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen
+; SI-NEXT:    s_lshl_b32 s5, s60, 8
+; SI-NEXT:    s_and_b32 s7, s16, 0xff
+; SI-NEXT:    s_or_b32 s5, s7, s5
+; SI-NEXT:    s_and_b32 s7, s58, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s56, 24
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    s_or_b32 s7, s9, s7
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_or_b32 s5, s5, s7
+; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    s_and_b32 s5, s17, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s94, 8
+; SI-NEXT:    s_or_b32 s5, s5, s7
+; SI-NEXT:    s_and_b32 s7, s93, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    s_lshl_b32 s9, s92, 24
+; SI-NEXT:    s_or_b32 s7, s9, s7
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_or_b32 s5, s5, s7
+; SI-NEXT:    v_mov_b32_e32 v2, s5
+; SI-NEXT:    s_lshl_b32 s5, s46, 8
+; SI-NEXT:    s_and_b32 s7, s18, 0xff
+; SI-NEXT:    s_or_b32 s5, s7, s5
+; SI-NEXT:    s_and_b32 s7, s44, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    s_lshl_b32 s9, s42, 24
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_or_b32 s7, s9, s7
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v13, vcc, 4, v0
-; SI-NEXT:    v_or_b32_e32 v10, v12, v10
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 8, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v14, v13, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v10, v11, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v11, s4
-; SI-NEXT:    s_and_b32 s4, s20, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; SI-NEXT:    v_or_b32_e32 v9, s4, v9
-; SI-NEXT:    s_and_b32 s4, s21, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s14, 8
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s13, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s12, s12, 24
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; SI-NEXT:    v_or_b32_e32 v7, v7, v8
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s12, s5
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
+; SI-NEXT:    s_or_b32 s5, s5, s7
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 12, v0
-; SI-NEXT:    v_or_b32_e32 v7, v9, v7
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 16, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v11, v10, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v8, s4
-; SI-NEXT:    s_and_b32 s4, s22, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
-; SI-NEXT:    v_or_b32_e32 v6, s4, v6
-; SI-NEXT:    s_and_b32 s4, s23, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s11, 8
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s10, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s9, s9, 24
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; SI-NEXT:    v_or_b32_e32 v4, v4, v5
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s9, s5
+; SI-NEXT:    v_mov_b32_e32 v2, s5
+; SI-NEXT:    s_and_b32 s5, s19, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s91, 8
+; SI-NEXT:    s_or_b32 s5, s5, s7
+; SI-NEXT:    s_and_b32 s7, s90, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    s_lshl_b32 s9, s89, 24
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_or_b32 s7, s9, s7
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
+; SI-NEXT:    s_or_b32 s5, s5, s7
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 20, v0
-; SI-NEXT:    v_or_b32_e32 v4, v6, v4
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 24, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v8, v7, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v5, s4
-; SI-NEXT:    s_and_b32 s4, s24, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; SI-NEXT:    v_or_b32_e32 v3, s4, v3
+; SI-NEXT:    v_mov_b32_e32 v2, s5
+; SI-NEXT:    s_and_b32 s5, s20, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s40, 8
+; SI-NEXT:    s_or_b32 s5, s5, s7
+; SI-NEXT:    s_and_b32 s7, s28, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    s_lshl_b32 s9, s26, 24
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_or_b32 s7, s9, s7
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 12, v0
+; SI-NEXT:    s_or_b32 s5, s5, s7
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s5
+; SI-NEXT:    s_and_b32 s5, s21, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s88, 8
+; SI-NEXT:    s_or_b32 s5, s5, s7
+; SI-NEXT:    s_and_b32 s7, s79, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    s_lshl_b32 s9, s78, 24
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_or_b32 s7, s9, s7
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 16, v0
+; SI-NEXT:    s_or_b32 s5, s5, s7
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s5
+; SI-NEXT:    s_and_b32 s5, s22, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s14, 8
+; SI-NEXT:    s_or_b32 s5, s5, s7
+; SI-NEXT:    s_and_b32 s7, s12, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    s_lshl_b32 s9, s10, 24
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_or_b32 s7, s9, s7
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 20, v0
+; SI-NEXT:    s_or_b32 s5, s5, s7
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s5
+; SI-NEXT:    s_and_b32 s5, s23, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s77, 8
+; SI-NEXT:    s_or_b32 s5, s5, s7
+; SI-NEXT:    s_and_b32 s7, s76, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    s_lshl_b32 s9, s75, 24
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_or_b32 s7, s9, s7
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 24, v0
+; SI-NEXT:    s_or_b32 s5, s5, s7
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s5
+; SI-NEXT:    s_and_b32 s5, s24, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s8, 8
+; SI-NEXT:    s_and_b32 s6, s6, 0xff
+; SI-NEXT:    s_or_b32 s5, s5, s7
+; SI-NEXT:    s_lshl_b32 s6, s6, 16
+; SI-NEXT:    s_lshl_b32 s4, s4, 24
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_or_b32 s4, s4, s6
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 28, v0
+; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s4
 ; SI-NEXT:    s_and_b32 s4, s25, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s8, 8
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT:    s_lshl_b32 s5, s74, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s7, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; SI-NEXT:    s_and_b32 s5, s73, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s6, s6, 24
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    s_lshl_b32 s6, s72, 24
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s5, s6, s5
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 28, v0
-; SI-NEXT:    v_or_b32_e32 v1, v3, v1
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 32, v0
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 32, v0
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 36, v0
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB79_4:
-; SI-NEXT:    ; implicit-def: $vgpr15
-; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; implicit-def: $vgpr13
+; SI-NEXT:    ; implicit-def: $sgpr60
+; SI-NEXT:    ; implicit-def: $sgpr58
+; SI-NEXT:    ; implicit-def: $sgpr56
+; SI-NEXT:    ; implicit-def: $sgpr94
+; SI-NEXT:    ; implicit-def: $sgpr93
+; SI-NEXT:    ; implicit-def: $sgpr92
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr44
+; SI-NEXT:    ; implicit-def: $sgpr42
+; SI-NEXT:    ; implicit-def: $sgpr91
+; SI-NEXT:    ; implicit-def: $sgpr90
+; SI-NEXT:    ; implicit-def: $sgpr89
 ; SI-NEXT:    ; implicit-def: $sgpr40
-; SI-NEXT:    ; implicit-def: $sgpr29
 ; SI-NEXT:    ; implicit-def: $sgpr28
-; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $sgpr27
 ; SI-NEXT:    ; implicit-def: $sgpr26
-; SI-NEXT:    ; implicit-def: $sgpr15
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr7
+; SI-NEXT:    ; implicit-def: $sgpr88
+; SI-NEXT:    ; implicit-def: $sgpr79
+; SI-NEXT:    ; implicit-def: $sgpr78
 ; SI-NEXT:    ; implicit-def: $sgpr14
-; SI-NEXT:    ; implicit-def: $sgpr13
 ; SI-NEXT:    ; implicit-def: $sgpr12
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $sgpr11
 ; SI-NEXT:    ; implicit-def: $sgpr10
-; SI-NEXT:    ; implicit-def: $sgpr9
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $sgpr77
+; SI-NEXT:    ; implicit-def: $sgpr76
+; SI-NEXT:    ; implicit-def: $sgpr75
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $sgpr7
 ; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr74
+; SI-NEXT:    ; implicit-def: $sgpr73
+; SI-NEXT:    ; implicit-def: $sgpr72
+; SI-NEXT:    ; implicit-def: $sgpr4
 ; SI-NEXT:    s_branch .LBB79_2
 ;
 ; VI-LABEL: bitcast_v5i64_to_v40i8_scalar:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll
index 6fc9a35cd9ee6..f335b48ba4ae1 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll
@@ -716,83 +716,77 @@ define inreg <22 x i16> @bitcast_v11i32_to_v22i16_scalar(<11 x i32> inreg %a, i3
 ; SI-NEXT:    s_cmp_lg_u32 s27, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB5_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s26
-; SI-NEXT:    v_alignbit_b32 v21, s4, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s24
-; SI-NEXT:    v_alignbit_b32 v17, s25, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    v_alignbit_b32 v13, s23, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v9, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s25, 16
-; SI-NEXT:    s_lshr_b32 s7, s23, 16
-; SI-NEXT:    s_lshr_b32 s8, s21, 16
-; SI-NEXT:    s_lshr_b32 s9, s19, 16
-; SI-NEXT:    s_lshr_b32 s10, s17, 16
+; SI-NEXT:    s_lshr_b32 s27, s25, 16
+; SI-NEXT:    s_lshr_b32 s40, s23, 16
+; SI-NEXT:    s_lshr_b32 s41, s21, 16
+; SI-NEXT:    s_lshr_b32 s42, s19, 16
+; SI-NEXT:    s_lshr_b32 s43, s17, 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB5_3
 ; SI-NEXT:  .LBB5_2: ; %cmp.true
-; SI-NEXT:    s_add_i32 s24, s24, 3
-; SI-NEXT:    s_add_i32 s22, s22, 3
 ; SI-NEXT:    s_add_i32 s25, s25, 3
-; SI-NEXT:    v_mov_b32_e32 v0, s24
-; SI-NEXT:    s_add_i32 s20, s20, 3
-; SI-NEXT:    s_add_i32 s23, s23, 3
-; SI-NEXT:    v_alignbit_b32 v17, s25, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    s_add_i32 s18, s18, 3
-; SI-NEXT:    s_add_i32 s21, s21, 3
-; SI-NEXT:    v_alignbit_b32 v13, s23, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    s_add_i32 s16, s16, 3
-; SI-NEXT:    s_add_i32 s19, s19, 3
-; SI-NEXT:    v_alignbit_b32 v9, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
 ; SI-NEXT:    s_add_i32 s26, s26, 3
 ; SI-NEXT:    s_add_i32 s17, s17, 3
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s26
-; SI-NEXT:    v_alignbit_b32 v21, s4, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s25, 16
-; SI-NEXT:    s_lshr_b32 s7, s23, 16
-; SI-NEXT:    s_lshr_b32 s8, s21, 16
-; SI-NEXT:    s_lshr_b32 s9, s19, 16
-; SI-NEXT:    s_lshr_b32 s10, s17, 16
+; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    s_add_i32 s19, s19, 3
+; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_add_i32 s21, s21, 3
+; SI-NEXT:    s_add_i32 s20, s20, 3
+; SI-NEXT:    s_add_i32 s23, s23, 3
+; SI-NEXT:    s_add_i32 s22, s22, 3
+; SI-NEXT:    s_add_i32 s24, s24, 3
+; SI-NEXT:    s_lshr_b32 s27, s25, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[16:17], 16
+; SI-NEXT:    s_lshr_b32 s40, s23, 16
+; SI-NEXT:    s_lshr_b32 s41, s21, 16
+; SI-NEXT:    s_lshr_b32 s42, s19, 16
+; SI-NEXT:    s_lshr_b32 s43, s17, 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[26:27], 16
 ; SI-NEXT:  .LBB5_3: ; %end
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
+; SI-NEXT:    v_mov_b32_e32 v1, s14
 ; SI-NEXT:    v_mov_b32_e32 v2, s17
-; SI-NEXT:    v_mov_b32_e32 v3, s10
+; SI-NEXT:    v_mov_b32_e32 v3, s43
 ; SI-NEXT:    v_mov_b32_e32 v4, s18
+; SI-NEXT:    v_mov_b32_e32 v5, s12
 ; SI-NEXT:    v_mov_b32_e32 v6, s19
-; SI-NEXT:    v_mov_b32_e32 v7, s9
+; SI-NEXT:    v_mov_b32_e32 v7, s42
 ; SI-NEXT:    v_mov_b32_e32 v8, s20
+; SI-NEXT:    v_mov_b32_e32 v9, s8
 ; SI-NEXT:    v_mov_b32_e32 v10, s21
-; SI-NEXT:    v_mov_b32_e32 v11, s8
+; SI-NEXT:    v_mov_b32_e32 v11, s41
 ; SI-NEXT:    v_mov_b32_e32 v12, s22
+; SI-NEXT:    v_mov_b32_e32 v13, s6
 ; SI-NEXT:    v_mov_b32_e32 v14, s23
-; SI-NEXT:    v_mov_b32_e32 v15, s7
+; SI-NEXT:    v_mov_b32_e32 v15, s40
 ; SI-NEXT:    v_mov_b32_e32 v16, s24
+; SI-NEXT:    v_mov_b32_e32 v17, s4
 ; SI-NEXT:    v_mov_b32_e32 v18, s25
-; SI-NEXT:    v_mov_b32_e32 v19, s6
+; SI-NEXT:    v_mov_b32_e32 v19, s27
 ; SI-NEXT:    v_mov_b32_e32 v20, s26
+; SI-NEXT:    v_mov_b32_e32 v21, s10
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB5_4:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr10
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $sgpr9
-; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr43
+; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr42
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $vgpr17
+; SI-NEXT:    ; implicit-def: $sgpr41
 ; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr21
+; SI-NEXT:    ; implicit-def: $sgpr40
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr27
+; SI-NEXT:    ; implicit-def: $sgpr10
 ; SI-NEXT:    s_branch .LBB5_2
 ;
 ; VI-LABEL: bitcast_v11i32_to_v22i16_scalar:
@@ -2676,78 +2670,89 @@ define inreg <22 x i16> @bitcast_v11f32_to_v22i16_scalar(<11 x float> inreg %a,
 ; SI-NEXT:    s_cmp_lg_u32 s27, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB13_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s26
-; SI-NEXT:    v_alignbit_b32 v21, s4, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s24
-; SI-NEXT:    v_alignbit_b32 v17, s25, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    v_alignbit_b32 v13, s23, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v9, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s10, s25, 16
-; SI-NEXT:    s_lshr_b32 s9, s23, 16
-; SI-NEXT:    s_lshr_b32 s8, s21, 16
-; SI-NEXT:    s_lshr_b32 s7, s19, 16
-; SI-NEXT:    s_lshr_b32 s6, s17, 16
+; SI-NEXT:    s_lshr_b32 s27, s17, 16
+; SI-NEXT:    s_lshr_b32 s43, s25, 16
+; SI-NEXT:    s_lshr_b32 s42, s23, 16
+; SI-NEXT:    s_lshr_b32 s41, s21, 16
+; SI-NEXT:    s_lshr_b32 s40, s19, 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB13_4
 ; SI-NEXT:  .LBB13_2: ; %cmp.true
 ; SI-NEXT:    v_add_f32_e64 v20, s26, 1.0
-; SI-NEXT:    v_add_f32_e64 v2, s17, 1.0
-; SI-NEXT:    v_add_f32_e64 v0, s16, 1.0
-; SI-NEXT:    v_add_f32_e64 v6, s19, 1.0
-; SI-NEXT:    v_add_f32_e64 v4, s18, 1.0
-; SI-NEXT:    v_add_f32_e64 v10, s21, 1.0
-; SI-NEXT:    v_add_f32_e64 v8, s20, 1.0
-; SI-NEXT:    v_add_f32_e64 v14, s23, 1.0
-; SI-NEXT:    v_add_f32_e64 v12, s22, 1.0
-; SI-NEXT:    v_add_f32_e64 v18, s25, 1.0
-; SI-NEXT:    v_add_f32_e64 v16, s24, 1.0
-; SI-NEXT:    v_alignbit_b32 v17, v18, v16, 16
-; SI-NEXT:    v_alignbit_b32 v13, v14, v12, 16
-; SI-NEXT:    v_alignbit_b32 v9, v10, v8, 16
-; SI-NEXT:    v_alignbit_b32 v5, v6, v4, 16
-; SI-NEXT:    v_alignbit_b32 v1, v2, v0, 16
-; SI-NEXT:    v_alignbit_b32 v21, s4, v20, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-NEXT:    s_setpc_b64 s[30:31]
+; SI-NEXT:    v_add_f32_e64 v32, s17, 1.0
+; SI-NEXT:    v_add_f32_e64 v31, s16, 1.0
+; SI-NEXT:    v_add_f32_e64 v30, s19, 1.0
+; SI-NEXT:    v_add_f32_e64 v29, s18, 1.0
+; SI-NEXT:    v_add_f32_e64 v28, s21, 1.0
+; SI-NEXT:    v_add_f32_e64 v27, s20, 1.0
+; SI-NEXT:    v_add_f32_e64 v26, s23, 1.0
+; SI-NEXT:    v_add_f32_e64 v25, s22, 1.0
+; SI-NEXT:    v_add_f32_e64 v24, s25, 1.0
+; SI-NEXT:    v_add_f32_e64 v23, s24, 1.0
+; SI-NEXT:    v_lshr_b64 v[17:18], v[23:24], 16
+; SI-NEXT:    v_lshr_b64 v[13:14], v[25:26], 16
+; SI-NEXT:    v_lshr_b64 v[9:10], v[27:28], 16
+; SI-NEXT:    v_lshr_b64 v[5:6], v[29:30], 16
+; SI-NEXT:    v_lshr_b64 v[1:2], v[31:32], 16
+; SI-NEXT:    v_lshr_b64 v[21:22], v[20:21], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v24
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v26
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v28
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v30
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v32
+; SI-NEXT:    s_branch .LBB13_5
 ; SI-NEXT:  .LBB13_3:
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr27
+; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr40
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr41
 ; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $sgpr42
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr43
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $sgpr9
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $sgpr10
-; SI-NEXT:    ; implicit-def: $vgpr21
 ; SI-NEXT:    s_branch .LBB13_2
 ; SI-NEXT:  .LBB13_4:
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_mov_b32_e32 v2, s17
-; SI-NEXT:    v_mov_b32_e32 v4, s18
-; SI-NEXT:    v_mov_b32_e32 v6, s19
-; SI-NEXT:    v_mov_b32_e32 v8, s20
-; SI-NEXT:    v_mov_b32_e32 v10, s21
-; SI-NEXT:    v_mov_b32_e32 v12, s22
-; SI-NEXT:    v_mov_b32_e32 v14, s23
-; SI-NEXT:    v_mov_b32_e32 v16, s24
-; SI-NEXT:    v_mov_b32_e32 v18, s25
+; SI-NEXT:    v_mov_b32_e32 v31, s16
+; SI-NEXT:    v_mov_b32_e32 v32, s17
+; SI-NEXT:    v_mov_b32_e32 v29, s18
+; SI-NEXT:    v_mov_b32_e32 v30, s19
+; SI-NEXT:    v_mov_b32_e32 v27, s20
+; SI-NEXT:    v_mov_b32_e32 v28, s21
+; SI-NEXT:    v_mov_b32_e32 v25, s22
+; SI-NEXT:    v_mov_b32_e32 v26, s23
+; SI-NEXT:    v_mov_b32_e32 v23, s24
+; SI-NEXT:    v_mov_b32_e32 v24, s25
 ; SI-NEXT:    v_mov_b32_e32 v20, s26
-; SI-NEXT:    v_mov_b32_e32 v3, s6
-; SI-NEXT:    v_mov_b32_e32 v7, s7
-; SI-NEXT:    v_mov_b32_e32 v11, s8
-; SI-NEXT:    v_mov_b32_e32 v15, s9
-; SI-NEXT:    v_mov_b32_e32 v19, s10
+; SI-NEXT:    v_mov_b32_e32 v3, s27
+; SI-NEXT:    v_mov_b32_e32 v7, s40
+; SI-NEXT:    v_mov_b32_e32 v11, s41
+; SI-NEXT:    v_mov_b32_e32 v15, s42
+; SI-NEXT:    v_mov_b32_e32 v19, s43
+; SI-NEXT:    v_mov_b32_e32 v21, s8
+; SI-NEXT:    v_mov_b32_e32 v1, s14
+; SI-NEXT:    v_mov_b32_e32 v5, s12
+; SI-NEXT:    v_mov_b32_e32 v9, s10
+; SI-NEXT:    v_mov_b32_e32 v13, s6
+; SI-NEXT:    v_mov_b32_e32 v17, s4
+; SI-NEXT:  .LBB13_5: ; %end
+; SI-NEXT:    v_mov_b32_e32 v0, v31
+; SI-NEXT:    v_mov_b32_e32 v2, v32
+; SI-NEXT:    v_mov_b32_e32 v4, v29
+; SI-NEXT:    v_mov_b32_e32 v6, v30
+; SI-NEXT:    v_mov_b32_e32 v8, v27
+; SI-NEXT:    v_mov_b32_e32 v10, v28
+; SI-NEXT:    v_mov_b32_e32 v12, v25
+; SI-NEXT:    v_mov_b32_e32 v14, v26
+; SI-NEXT:    v_mov_b32_e32 v16, v23
+; SI-NEXT:    v_mov_b32_e32 v18, v24
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bitcast_v11f32_to_v22i16_scalar:
@@ -5293,136 +5298,137 @@ define inreg <22 x i16> @bitcast_v22f16_to_v22i16_scalar(<22 x half> inreg %a, i
 ; SI-LABEL: bitcast_v22f16_to_v22i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v12, v7
-; SI-NEXT:    v_mov_b32_e32 v13, v6
-; SI-NEXT:    v_mov_b32_e32 v19, v5
-; SI-NEXT:    v_mov_b32_e32 v18, v4
-; SI-NEXT:    v_mov_b32_e32 v17, v3
-; SI-NEXT:    v_mov_b32_e32 v16, v2
-; SI-NEXT:    v_mov_b32_e32 v15, v1
+; SI-NEXT:    v_mov_b32_e32 v9, v7
+; SI-NEXT:    v_mov_b32_e32 v10, v4
+; SI-NEXT:    v_mov_b32_e32 v13, v3
 ; SI-NEXT:    v_mov_b32_e32 v14, v0
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, s16
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, s17
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, s18
+; SI-NEXT:    v_cvt_f16_f32_e32 v23, s17
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, s19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, s20
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, s21
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, s22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, s23
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, s24
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, s25
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, s26
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, s27
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
-; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
-; SI-NEXT:    v_cvt_f16_f32_e32 v20, v13
-; SI-NEXT:    v_cvt_f16_f32_e32 v21, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, s28
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, s29
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v15, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v16, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v27, v13
+; SI-NEXT:    v_cvt_f16_f32_e32 v18, v10
+; SI-NEXT:    v_cvt_f16_f32_e32 v19, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v20, v6
+; SI-NEXT:    v_cvt_f16_f32_e32 v21, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, s18
+; SI-NEXT:    v_cvt_f16_f32_e32 v24, s21
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, s22
+; SI-NEXT:    v_cvt_f16_f32_e32 v25, s25
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, s26
+; SI-NEXT:    v_cvt_f16_f32_e32 v26, s29
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_cbranch_scc0 .LBB23_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_cbranch_execnz .LBB23_3
 ; SI-NEXT:  .LBB23_2: ; %cmp.true
-; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT:    v_cvt_f32_f16_e32 v20, v20
-; SI-NEXT:    v_cvt_f32_f16_e32 v19, v19
-; SI-NEXT:    v_cvt_f32_f16_e32 v18, v18
-; SI-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; SI-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; SI-NEXT:    v_add_f32_e32 v21, 0x38000000, v21
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT:    v_add_f32_e32 v20, 0x38000000, v20
-; SI-NEXT:    v_cvt_f16_f32_e32 v21, v21
-; SI-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
-; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
-; SI-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; SI-NEXT:    v_cvt_f32_f16_e32 v15, v15
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
+; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
+; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
 ; SI-NEXT:    v_add_f32_e32 v15, 0x38000000, v15
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
-; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
 ; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
 ; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
-; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v23
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_or_b32_e32 v2, v2, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v24
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
+; SI-NEXT:    v_or_b32_e32 v6, v6, v9
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v25
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
+; SI-NEXT:    v_or_b32_e32 v10, v10, v13
+; SI-NEXT:    v_cvt_f32_f16_e32 v13, v26
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v15
+; SI-NEXT:    v_cvt_f32_f16_e32 v19, v19
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_or_b32_e32 v14, v14, v17
+; SI-NEXT:    v_cvt_f32_f16_e32 v17, v27
+; SI-NEXT:    v_cvt_f32_f16_e32 v18, v18
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
+; SI-NEXT:    v_cvt_f32_f16_e32 v20, v20
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
 ; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
 ; SI-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
-; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
-; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT:    v_or_b32_e32 v20, v20, v22
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
-; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
 ; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
+; SI-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
+; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
 ; SI-NEXT:    v_add_f32_e32 v17, 0x38000000, v17
-; SI-NEXT:    v_or_b32_e32 v18, v18, v22
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
+; SI-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
+; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
+; SI-NEXT:    v_add_f32_e32 v21, 0x38000000, v21
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v21, v21
+; SI-NEXT:    v_add_f32_e32 v20, 0x38000000, v20
 ; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
 ; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
 ; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
 ; SI-NEXT:    v_add_f32_e32 v16, 0x38000000, v16
-; SI-NEXT:    v_or_b32_e32 v14, v14, v22
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v11
+; SI-NEXT:    v_lshr_b64 v[23:24], v[1:2], 16
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; SI-NEXT:    v_or_b32_e32 v10, v10, v22
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v7
-; SI-NEXT:    v_or_b32_e32 v6, v6, v22
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v3
-; SI-NEXT:    v_or_b32_e32 v2, v2, v22
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; SI-NEXT:    v_lshr_b64 v[24:25], v[5:6], 16
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v19
+; SI-NEXT:    v_lshr_b64 v[25:26], v[9:10], 16
+; SI-NEXT:    v_or_b32_e32 v18, v18, v22
 ; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; SI-NEXT:    v_lshr_b64 v[26:27], v[13:14], 16
+; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v21
+; SI-NEXT:    v_lshr_b64 v[27:28], v[17:18], 16
+; SI-NEXT:    v_or_b32_e32 v20, v20, v22
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SI-NEXT:    v_or_b32_e32 v4, v4, v5
 ; SI-NEXT:    v_or_b32_e32 v8, v8, v9
 ; SI-NEXT:    v_or_b32_e32 v12, v12, v13
 ; SI-NEXT:    v_or_b32_e32 v16, v16, v17
-; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v9, v10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v13, v14, v13, 16
-; SI-NEXT:    v_alignbit_b32 v17, v18, v17, 16
 ; SI-NEXT:  .LBB23_3: ; %end
+; SI-NEXT:    v_mov_b32_e32 v1, v23
+; SI-NEXT:    v_mov_b32_e32 v5, v24
+; SI-NEXT:    v_mov_b32_e32 v9, v25
+; SI-NEXT:    v_mov_b32_e32 v13, v26
+; SI-NEXT:    v_mov_b32_e32 v17, v27
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB23_4:
 ; SI-NEXT:    s_branch .LBB23_2
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll
index c9860dbb7d72c..2cde373ec130c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll
@@ -1824,89 +1824,83 @@ define inreg <24 x i16> @bitcast_v12i32_to_v24i16_scalar(<12 x i32> inreg %a, i3
 ; SI-NEXT:    s_cmp_lg_u32 s28, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB13_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s26
-; SI-NEXT:    v_alignbit_b32 v21, s27, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s24
-; SI-NEXT:    v_alignbit_b32 v17, s25, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    v_alignbit_b32 v13, s23, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v9, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s27, 16
-; SI-NEXT:    s_lshr_b32 s7, s25, 16
-; SI-NEXT:    s_lshr_b32 s8, s23, 16
-; SI-NEXT:    s_lshr_b32 s9, s21, 16
-; SI-NEXT:    s_lshr_b32 s10, s19, 16
-; SI-NEXT:    s_lshr_b32 s11, s17, 16
+; SI-NEXT:    s_lshr_b32 s40, s27, 16
+; SI-NEXT:    s_lshr_b32 s41, s25, 16
+; SI-NEXT:    s_lshr_b32 s42, s23, 16
+; SI-NEXT:    s_lshr_b32 s43, s21, 16
+; SI-NEXT:    s_lshr_b32 s44, s19, 16
+; SI-NEXT:    s_lshr_b32 s45, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB13_3
 ; SI-NEXT:  .LBB13_2: ; %cmp.true
-; SI-NEXT:    s_add_i32 s26, s26, 3
-; SI-NEXT:    s_add_i32 s24, s24, 3
-; SI-NEXT:    s_add_i32 s27, s27, 3
-; SI-NEXT:    v_mov_b32_e32 v0, s26
-; SI-NEXT:    s_add_i32 s22, s22, 3
-; SI-NEXT:    s_add_i32 s25, s25, 3
-; SI-NEXT:    v_alignbit_b32 v21, s27, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s24
-; SI-NEXT:    s_add_i32 s20, s20, 3
-; SI-NEXT:    s_add_i32 s23, s23, 3
-; SI-NEXT:    v_alignbit_b32 v17, s25, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    s_add_i32 s18, s18, 3
-; SI-NEXT:    s_add_i32 s21, s21, 3
-; SI-NEXT:    v_alignbit_b32 v13, s23, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s20
+; SI-NEXT:    s_add_i32 s17, s17, 3
 ; SI-NEXT:    s_add_i32 s16, s16, 3
 ; SI-NEXT:    s_add_i32 s19, s19, 3
-; SI-NEXT:    v_alignbit_b32 v9, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    s_add_i32 s17, s17, 3
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s27, 16
-; SI-NEXT:    s_lshr_b32 s7, s25, 16
-; SI-NEXT:    s_lshr_b32 s8, s23, 16
-; SI-NEXT:    s_lshr_b32 s9, s21, 16
-; SI-NEXT:    s_lshr_b32 s10, s19, 16
-; SI-NEXT:    s_lshr_b32 s11, s17, 16
+; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_add_i32 s21, s21, 3
+; SI-NEXT:    s_add_i32 s20, s20, 3
+; SI-NEXT:    s_add_i32 s23, s23, 3
+; SI-NEXT:    s_add_i32 s22, s22, 3
+; SI-NEXT:    s_add_i32 s25, s25, 3
+; SI-NEXT:    s_add_i32 s24, s24, 3
+; SI-NEXT:    s_add_i32 s27, s27, 3
+; SI-NEXT:    s_add_i32 s26, s26, 3
+; SI-NEXT:    s_lshr_b64 s[4:5], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[16:17], 16
+; SI-NEXT:    s_lshr_b32 s40, s27, 16
+; SI-NEXT:    s_lshr_b32 s41, s25, 16
+; SI-NEXT:    s_lshr_b32 s42, s23, 16
+; SI-NEXT:    s_lshr_b32 s43, s21, 16
+; SI-NEXT:    s_lshr_b32 s44, s19, 16
+; SI-NEXT:    s_lshr_b32 s45, s17, 16
 ; SI-NEXT:  .LBB13_3: ; %end
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
+; SI-NEXT:    v_mov_b32_e32 v1, s14
 ; SI-NEXT:    v_mov_b32_e32 v2, s17
-; SI-NEXT:    v_mov_b32_e32 v3, s11
+; SI-NEXT:    v_mov_b32_e32 v3, s45
 ; SI-NEXT:    v_mov_b32_e32 v4, s18
+; SI-NEXT:    v_mov_b32_e32 v5, s12
 ; SI-NEXT:    v_mov_b32_e32 v6, s19
-; SI-NEXT:    v_mov_b32_e32 v7, s10
+; SI-NEXT:    v_mov_b32_e32 v7, s44
 ; SI-NEXT:    v_mov_b32_e32 v8, s20
+; SI-NEXT:    v_mov_b32_e32 v9, s10
 ; SI-NEXT:    v_mov_b32_e32 v10, s21
-; SI-NEXT:    v_mov_b32_e32 v11, s9
+; SI-NEXT:    v_mov_b32_e32 v11, s43
 ; SI-NEXT:    v_mov_b32_e32 v12, s22
+; SI-NEXT:    v_mov_b32_e32 v13, s8
 ; SI-NEXT:    v_mov_b32_e32 v14, s23
-; SI-NEXT:    v_mov_b32_e32 v15, s8
+; SI-NEXT:    v_mov_b32_e32 v15, s42
 ; SI-NEXT:    v_mov_b32_e32 v16, s24
+; SI-NEXT:    v_mov_b32_e32 v17, s6
 ; SI-NEXT:    v_mov_b32_e32 v18, s25
-; SI-NEXT:    v_mov_b32_e32 v19, s7
+; SI-NEXT:    v_mov_b32_e32 v19, s41
 ; SI-NEXT:    v_mov_b32_e32 v20, s26
+; SI-NEXT:    v_mov_b32_e32 v21, s4
 ; SI-NEXT:    v_mov_b32_e32 v22, s27
-; SI-NEXT:    v_mov_b32_e32 v23, s6
+; SI-NEXT:    v_mov_b32_e32 v23, s40
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB13_4:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr11
-; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr45
+; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr44
 ; SI-NEXT:    ; implicit-def: $sgpr10
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $sgpr9
-; SI-NEXT:    ; implicit-def: $vgpr13
+; SI-NEXT:    ; implicit-def: $sgpr43
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $vgpr21
+; SI-NEXT:    ; implicit-def: $sgpr42
 ; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr41
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr40
 ; SI-NEXT:    s_branch .LBB13_2
 ;
 ; VI-LABEL: bitcast_v12i32_to_v24i16_scalar:
@@ -5008,84 +5002,97 @@ define inreg <24 x i16> @bitcast_v12f32_to_v24i16_scalar(<12 x float> inreg %a,
 ; SI-NEXT:    s_cmp_lg_u32 s28, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB29_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s26
-; SI-NEXT:    v_alignbit_b32 v21, s27, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s24
-; SI-NEXT:    v_alignbit_b32 v17, s25, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    v_alignbit_b32 v13, s23, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v9, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s11, s27, 16
-; SI-NEXT:    s_lshr_b32 s10, s25, 16
-; SI-NEXT:    s_lshr_b32 s9, s23, 16
-; SI-NEXT:    s_lshr_b32 s8, s21, 16
-; SI-NEXT:    s_lshr_b32 s7, s19, 16
-; SI-NEXT:    s_lshr_b32 s6, s17, 16
+; SI-NEXT:    s_lshr_b32 s45, s27, 16
+; SI-NEXT:    s_lshr_b32 s44, s25, 16
+; SI-NEXT:    s_lshr_b32 s43, s23, 16
+; SI-NEXT:    s_lshr_b32 s42, s21, 16
+; SI-NEXT:    s_lshr_b32 s41, s19, 16
+; SI-NEXT:    s_lshr_b32 s40, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB29_4
 ; SI-NEXT:  .LBB29_2: ; %cmp.true
-; SI-NEXT:    v_add_f32_e64 v2, s17, 1.0
-; SI-NEXT:    v_add_f32_e64 v0, s16, 1.0
-; SI-NEXT:    v_add_f32_e64 v6, s19, 1.0
-; SI-NEXT:    v_add_f32_e64 v4, s18, 1.0
-; SI-NEXT:    v_add_f32_e64 v10, s21, 1.0
-; SI-NEXT:    v_add_f32_e64 v8, s20, 1.0
-; SI-NEXT:    v_add_f32_e64 v14, s23, 1.0
-; SI-NEXT:    v_add_f32_e64 v12, s22, 1.0
-; SI-NEXT:    v_add_f32_e64 v18, s25, 1.0
-; SI-NEXT:    v_add_f32_e64 v16, s24, 1.0
-; SI-NEXT:    v_add_f32_e64 v22, s27, 1.0
-; SI-NEXT:    v_add_f32_e64 v20, s26, 1.0
-; SI-NEXT:    v_alignbit_b32 v21, v22, v20, 16
-; SI-NEXT:    v_alignbit_b32 v17, v18, v16, 16
-; SI-NEXT:    v_alignbit_b32 v13, v14, v12, 16
-; SI-NEXT:    v_alignbit_b32 v9, v10, v8, 16
-; SI-NEXT:    v_alignbit_b32 v5, v6, v4, 16
-; SI-NEXT:    v_alignbit_b32 v1, v2, v0, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v22
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-NEXT:    s_setpc_b64 s[30:31]
+; SI-NEXT:    v_add_f32_e64 v35, s17, 1.0
+; SI-NEXT:    v_add_f32_e64 v34, s16, 1.0
+; SI-NEXT:    v_add_f32_e64 v33, s19, 1.0
+; SI-NEXT:    v_add_f32_e64 v32, s18, 1.0
+; SI-NEXT:    v_add_f32_e64 v31, s21, 1.0
+; SI-NEXT:    v_add_f32_e64 v30, s20, 1.0
+; SI-NEXT:    v_add_f32_e64 v29, s23, 1.0
+; SI-NEXT:    v_add_f32_e64 v28, s22, 1.0
+; SI-NEXT:    v_add_f32_e64 v27, s25, 1.0
+; SI-NEXT:    v_add_f32_e64 v26, s24, 1.0
+; SI-NEXT:    v_add_f32_e64 v25, s27, 1.0
+; SI-NEXT:    v_add_f32_e64 v24, s26, 1.0
+; SI-NEXT:    v_lshr_b64 v[21:22], v[24:25], 16
+; SI-NEXT:    v_lshr_b64 v[17:18], v[26:27], 16
+; SI-NEXT:    v_lshr_b64 v[13:14], v[28:29], 16
+; SI-NEXT:    v_lshr_b64 v[9:10], v[30:31], 16
+; SI-NEXT:    v_lshr_b64 v[5:6], v[32:33], 16
+; SI-NEXT:    v_lshr_b64 v[1:2], v[34:35], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v25
+; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v27
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v29
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v33
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v35
+; SI-NEXT:    s_branch .LBB29_5
 ; SI-NEXT:  .LBB29_3:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $sgpr9
-; SI-NEXT:    ; implicit-def: $vgpr17
+; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr40
+; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr41
 ; SI-NEXT:    ; implicit-def: $sgpr10
-; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $sgpr11
+; SI-NEXT:    ; implicit-def: $sgpr42
+; SI-NEXT:    ; implicit-def: $sgpr8
+; SI-NEXT:    ; implicit-def: $sgpr43
+; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr44
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr45
 ; SI-NEXT:    s_branch .LBB29_2
 ; SI-NEXT:  .LBB29_4:
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_mov_b32_e32 v2, s17
-; SI-NEXT:    v_mov_b32_e32 v4, s18
-; SI-NEXT:    v_mov_b32_e32 v6, s19
-; SI-NEXT:    v_mov_b32_e32 v8, s20
-; SI-NEXT:    v_mov_b32_e32 v10, s21
-; SI-NEXT:    v_mov_b32_e32 v12, s22
-; SI-NEXT:    v_mov_b32_e32 v14, s23
-; SI-NEXT:    v_mov_b32_e32 v16, s24
-; SI-NEXT:    v_mov_b32_e32 v18, s25
-; SI-NEXT:    v_mov_b32_e32 v20, s26
-; SI-NEXT:    v_mov_b32_e32 v22, s27
-; SI-NEXT:    v_mov_b32_e32 v3, s6
-; SI-NEXT:    v_mov_b32_e32 v7, s7
-; SI-NEXT:    v_mov_b32_e32 v11, s8
-; SI-NEXT:    v_mov_b32_e32 v15, s9
-; SI-NEXT:    v_mov_b32_e32 v19, s10
-; SI-NEXT:    v_mov_b32_e32 v23, s11
+; SI-NEXT:    v_mov_b32_e32 v34, s16
+; SI-NEXT:    v_mov_b32_e32 v35, s17
+; SI-NEXT:    v_mov_b32_e32 v32, s18
+; SI-NEXT:    v_mov_b32_e32 v33, s19
+; SI-NEXT:    v_mov_b32_e32 v30, s20
+; SI-NEXT:    v_mov_b32_e32 v31, s21
+; SI-NEXT:    v_mov_b32_e32 v28, s22
+; SI-NEXT:    v_mov_b32_e32 v29, s23
+; SI-NEXT:    v_mov_b32_e32 v26, s24
+; SI-NEXT:    v_mov_b32_e32 v27, s25
+; SI-NEXT:    v_mov_b32_e32 v24, s26
+; SI-NEXT:    v_mov_b32_e32 v25, s27
+; SI-NEXT:    v_mov_b32_e32 v3, s40
+; SI-NEXT:    v_mov_b32_e32 v7, s41
+; SI-NEXT:    v_mov_b32_e32 v11, s42
+; SI-NEXT:    v_mov_b32_e32 v15, s43
+; SI-NEXT:    v_mov_b32_e32 v19, s44
+; SI-NEXT:    v_mov_b32_e32 v23, s45
+; SI-NEXT:    v_mov_b32_e32 v1, s14
+; SI-NEXT:    v_mov_b32_e32 v5, s12
+; SI-NEXT:    v_mov_b32_e32 v9, s10
+; SI-NEXT:    v_mov_b32_e32 v13, s8
+; SI-NEXT:    v_mov_b32_e32 v17, s6
+; SI-NEXT:    v_mov_b32_e32 v21, s4
+; SI-NEXT:  .LBB29_5: ; %end
+; SI-NEXT:    v_mov_b32_e32 v0, v34
+; SI-NEXT:    v_mov_b32_e32 v2, v35
+; SI-NEXT:    v_mov_b32_e32 v4, v32
+; SI-NEXT:    v_mov_b32_e32 v6, v33
+; SI-NEXT:    v_mov_b32_e32 v8, v30
+; SI-NEXT:    v_mov_b32_e32 v10, v31
+; SI-NEXT:    v_mov_b32_e32 v12, v28
+; SI-NEXT:    v_mov_b32_e32 v14, v29
+; SI-NEXT:    v_mov_b32_e32 v16, v26
+; SI-NEXT:    v_mov_b32_e32 v18, v27
+; SI-NEXT:    v_mov_b32_e32 v20, v24
+; SI-NEXT:    v_mov_b32_e32 v22, v25
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bitcast_v12f32_to_v24i16_scalar:
@@ -7630,91 +7637,91 @@ define inreg <24 x i16> @bitcast_v6f64_to_v24i16_scalar(<6 x double> inreg %a, i
 ; SI-NEXT:    s_cmp_lg_u32 s28, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB41_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s26
-; SI-NEXT:    v_alignbit_b32 v24, s27, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s24
-; SI-NEXT:    v_alignbit_b32 v25, s25, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    v_alignbit_b32 v26, s23, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v27, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v28, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v29, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s11, s27, 16
-; SI-NEXT:    s_lshr_b32 s10, s25, 16
-; SI-NEXT:    s_lshr_b32 s9, s23, 16
-; SI-NEXT:    s_lshr_b32 s8, s21, 16
-; SI-NEXT:    s_lshr_b32 s7, s19, 16
-; SI-NEXT:    s_lshr_b32 s6, s17, 16
+; SI-NEXT:    s_lshr_b32 s45, s27, 16
+; SI-NEXT:    s_lshr_b32 s44, s25, 16
+; SI-NEXT:    s_lshr_b32 s43, s23, 16
+; SI-NEXT:    s_lshr_b32 s42, s21, 16
+; SI-NEXT:    s_lshr_b32 s41, s19, 16
+; SI-NEXT:    s_lshr_b32 s40, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB41_4
 ; SI-NEXT:  .LBB41_2: ; %cmp.true
-; SI-NEXT:    v_add_f64 v[0:1], s[16:17], 1.0
-; SI-NEXT:    v_add_f64 v[4:5], s[18:19], 1.0
-; SI-NEXT:    v_add_f64 v[8:9], s[20:21], 1.0
-; SI-NEXT:    v_add_f64 v[12:13], s[22:23], 1.0
-; SI-NEXT:    v_add_f64 v[20:21], s[26:27], 1.0
-; SI-NEXT:    v_add_f64 v[16:17], s[24:25], 1.0
-; SI-NEXT:    v_alignbit_b32 v24, v21, v20, 16
-; SI-NEXT:    v_alignbit_b32 v25, v17, v16, 16
-; SI-NEXT:    v_alignbit_b32 v26, v13, v12, 16
-; SI-NEXT:    v_alignbit_b32 v27, v9, v8, 16
-; SI-NEXT:    v_alignbit_b32 v28, v5, v4, 16
-; SI-NEXT:    v_alignbit_b32 v29, v1, v0, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v21
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v13
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; SI-NEXT:    v_add_f64 v[24:25], s[26:27], 1.0
+; SI-NEXT:    v_add_f64 v[26:27], s[24:25], 1.0
+; SI-NEXT:    v_add_f64 v[28:29], s[22:23], 1.0
+; SI-NEXT:    v_add_f64 v[30:31], s[20:21], 1.0
+; SI-NEXT:    v_add_f64 v[32:33], s[18:19], 1.0
+; SI-NEXT:    v_add_f64 v[34:35], s[16:17], 1.0
+; SI-NEXT:    v_lshr_b64 v[21:22], v[24:25], 16
+; SI-NEXT:    v_lshr_b64 v[17:18], v[26:27], 16
+; SI-NEXT:    v_lshr_b64 v[13:14], v[28:29], 16
+; SI-NEXT:    v_lshr_b64 v[9:10], v[30:31], 16
+; SI-NEXT:    v_lshr_b64 v[5:6], v[32:33], 16
+; SI-NEXT:    v_lshr_b64 v[1:2], v[34:35], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v25
+; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v27
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v29
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v33
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v35
 ; SI-NEXT:    s_branch .LBB41_5
 ; SI-NEXT:  .LBB41_3:
-; SI-NEXT:    ; implicit-def: $vgpr29
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $vgpr26
-; SI-NEXT:    ; implicit-def: $sgpr9
-; SI-NEXT:    ; implicit-def: $vgpr25
+; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr40
+; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr41
 ; SI-NEXT:    ; implicit-def: $sgpr10
-; SI-NEXT:    ; implicit-def: $vgpr24
-; SI-NEXT:    ; implicit-def: $sgpr11
+; SI-NEXT:    ; implicit-def: $sgpr42
+; SI-NEXT:    ; implicit-def: $sgpr8
+; SI-NEXT:    ; implicit-def: $sgpr43
+; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr44
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr45
 ; SI-NEXT:    s_branch .LBB41_2
 ; SI-NEXT:  .LBB41_4:
-; SI-NEXT:    v_mov_b32_e32 v1, s17
-; SI-NEXT:    v_mov_b32_e32 v5, s19
-; SI-NEXT:    v_mov_b32_e32 v9, s21
-; SI-NEXT:    v_mov_b32_e32 v13, s23
-; SI-NEXT:    v_mov_b32_e32 v17, s25
-; SI-NEXT:    v_mov_b32_e32 v21, s27
-; SI-NEXT:    v_mov_b32_e32 v20, s26
-; SI-NEXT:    v_mov_b32_e32 v16, s24
-; SI-NEXT:    v_mov_b32_e32 v12, s22
-; SI-NEXT:    v_mov_b32_e32 v8, s20
-; SI-NEXT:    v_mov_b32_e32 v4, s18
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_mov_b32_e32 v3, s6
-; SI-NEXT:    v_mov_b32_e32 v7, s7
-; SI-NEXT:    v_mov_b32_e32 v11, s8
-; SI-NEXT:    v_mov_b32_e32 v15, s9
-; SI-NEXT:    v_mov_b32_e32 v19, s10
-; SI-NEXT:    v_mov_b32_e32 v23, s11
+; SI-NEXT:    v_mov_b32_e32 v25, s27
+; SI-NEXT:    v_mov_b32_e32 v27, s25
+; SI-NEXT:    v_mov_b32_e32 v29, s23
+; SI-NEXT:    v_mov_b32_e32 v31, s21
+; SI-NEXT:    v_mov_b32_e32 v33, s19
+; SI-NEXT:    v_mov_b32_e32 v35, s17
+; SI-NEXT:    v_mov_b32_e32 v34, s16
+; SI-NEXT:    v_mov_b32_e32 v32, s18
+; SI-NEXT:    v_mov_b32_e32 v30, s20
+; SI-NEXT:    v_mov_b32_e32 v28, s22
+; SI-NEXT:    v_mov_b32_e32 v26, s24
+; SI-NEXT:    v_mov_b32_e32 v24, s26
+; SI-NEXT:    v_mov_b32_e32 v23, s45
+; SI-NEXT:    v_mov_b32_e32 v19, s44
+; SI-NEXT:    v_mov_b32_e32 v15, s43
+; SI-NEXT:    v_mov_b32_e32 v11, s42
+; SI-NEXT:    v_mov_b32_e32 v7, s41
+; SI-NEXT:    v_mov_b32_e32 v3, s40
+; SI-NEXT:    v_mov_b32_e32 v1, s14
+; SI-NEXT:    v_mov_b32_e32 v5, s12
+; SI-NEXT:    v_mov_b32_e32 v9, s10
+; SI-NEXT:    v_mov_b32_e32 v13, s8
+; SI-NEXT:    v_mov_b32_e32 v17, s6
+; SI-NEXT:    v_mov_b32_e32 v21, s4
 ; SI-NEXT:  .LBB41_5: ; %end
-; SI-NEXT:    v_mov_b32_e32 v2, v1
-; SI-NEXT:    v_mov_b32_e32 v6, v5
-; SI-NEXT:    v_mov_b32_e32 v10, v9
-; SI-NEXT:    v_mov_b32_e32 v14, v13
-; SI-NEXT:    v_mov_b32_e32 v18, v17
-; SI-NEXT:    v_mov_b32_e32 v22, v21
-; SI-NEXT:    v_mov_b32_e32 v1, v29
-; SI-NEXT:    v_mov_b32_e32 v5, v28
-; SI-NEXT:    v_mov_b32_e32 v9, v27
-; SI-NEXT:    v_mov_b32_e32 v13, v26
-; SI-NEXT:    v_mov_b32_e32 v17, v25
-; SI-NEXT:    v_mov_b32_e32 v21, v24
+; SI-NEXT:    v_mov_b32_e32 v0, v34
+; SI-NEXT:    v_mov_b32_e32 v2, v35
+; SI-NEXT:    v_mov_b32_e32 v4, v32
+; SI-NEXT:    v_mov_b32_e32 v6, v33
+; SI-NEXT:    v_mov_b32_e32 v8, v30
+; SI-NEXT:    v_mov_b32_e32 v10, v31
+; SI-NEXT:    v_mov_b32_e32 v12, v28
+; SI-NEXT:    v_mov_b32_e32 v14, v29
+; SI-NEXT:    v_mov_b32_e32 v16, v26
+; SI-NEXT:    v_mov_b32_e32 v18, v27
+; SI-NEXT:    v_mov_b32_e32 v20, v24
+; SI-NEXT:    v_mov_b32_e32 v22, v25
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bitcast_v6f64_to_v24i16_scalar:
@@ -9690,89 +9697,83 @@ define inreg <24 x i16> @bitcast_v6i64_to_v24i16_scalar(<6 x i64> inreg %a, i32
 ; SI-NEXT:    s_cmp_lg_u32 s28, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB49_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s26
-; SI-NEXT:    v_alignbit_b32 v21, s27, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s24
-; SI-NEXT:    v_alignbit_b32 v17, s25, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    v_alignbit_b32 v13, s23, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v9, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s27, 16
-; SI-NEXT:    s_lshr_b32 s7, s25, 16
-; SI-NEXT:    s_lshr_b32 s8, s23, 16
-; SI-NEXT:    s_lshr_b32 s9, s21, 16
-; SI-NEXT:    s_lshr_b32 s10, s19, 16
-; SI-NEXT:    s_lshr_b32 s11, s17, 16
+; SI-NEXT:    s_lshr_b32 s40, s27, 16
+; SI-NEXT:    s_lshr_b32 s41, s25, 16
+; SI-NEXT:    s_lshr_b32 s42, s23, 16
+; SI-NEXT:    s_lshr_b32 s43, s21, 16
+; SI-NEXT:    s_lshr_b32 s44, s19, 16
+; SI-NEXT:    s_lshr_b32 s45, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB49_3
 ; SI-NEXT:  .LBB49_2: ; %cmp.true
-; SI-NEXT:    s_add_u32 s16, s16, 3
-; SI-NEXT:    s_addc_u32 s17, s17, 0
-; SI-NEXT:    s_add_u32 s18, s18, 3
-; SI-NEXT:    s_addc_u32 s19, s19, 0
-; SI-NEXT:    s_add_u32 s20, s20, 3
-; SI-NEXT:    s_addc_u32 s21, s21, 0
-; SI-NEXT:    s_add_u32 s22, s22, 3
-; SI-NEXT:    s_addc_u32 s23, s23, 0
-; SI-NEXT:    s_add_u32 s24, s24, 3
-; SI-NEXT:    s_addc_u32 s25, s25, 0
 ; SI-NEXT:    s_add_u32 s26, s26, 3
 ; SI-NEXT:    s_addc_u32 s27, s27, 0
-; SI-NEXT:    v_mov_b32_e32 v0, s26
-; SI-NEXT:    v_alignbit_b32 v21, s27, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s24
-; SI-NEXT:    v_alignbit_b32 v17, s25, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    v_alignbit_b32 v13, s23, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v9, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s27, 16
-; SI-NEXT:    s_lshr_b32 s7, s25, 16
-; SI-NEXT:    s_lshr_b32 s8, s23, 16
-; SI-NEXT:    s_lshr_b32 s9, s21, 16
-; SI-NEXT:    s_lshr_b32 s10, s19, 16
-; SI-NEXT:    s_lshr_b32 s11, s17, 16
+; SI-NEXT:    s_add_u32 s24, s24, 3
+; SI-NEXT:    s_addc_u32 s25, s25, 0
+; SI-NEXT:    s_add_u32 s22, s22, 3
+; SI-NEXT:    s_addc_u32 s23, s23, 0
+; SI-NEXT:    s_add_u32 s20, s20, 3
+; SI-NEXT:    s_addc_u32 s21, s21, 0
+; SI-NEXT:    s_add_u32 s18, s18, 3
+; SI-NEXT:    s_addc_u32 s19, s19, 0
+; SI-NEXT:    s_add_u32 s16, s16, 3
+; SI-NEXT:    s_addc_u32 s17, s17, 0
+; SI-NEXT:    s_lshr_b32 s40, s27, 16
+; SI-NEXT:    s_lshr_b32 s41, s25, 16
+; SI-NEXT:    s_lshr_b32 s42, s23, 16
+; SI-NEXT:    s_lshr_b32 s43, s21, 16
+; SI-NEXT:    s_lshr_b32 s44, s19, 16
+; SI-NEXT:    s_lshr_b32 s45, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[16:17], 16
 ; SI-NEXT:  .LBB49_3: ; %end
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
+; SI-NEXT:    v_mov_b32_e32 v1, s14
 ; SI-NEXT:    v_mov_b32_e32 v2, s17
-; SI-NEXT:    v_mov_b32_e32 v3, s11
+; SI-NEXT:    v_mov_b32_e32 v3, s45
 ; SI-NEXT:    v_mov_b32_e32 v4, s18
+; SI-NEXT:    v_mov_b32_e32 v5, s12
 ; SI-NEXT:    v_mov_b32_e32 v6, s19
-; SI-NEXT:    v_mov_b32_e32 v7, s10
+; SI-NEXT:    v_mov_b32_e32 v7, s44
 ; SI-NEXT:    v_mov_b32_e32 v8, s20
+; SI-NEXT:    v_mov_b32_e32 v9, s10
 ; SI-NEXT:    v_mov_b32_e32 v10, s21
-; SI-NEXT:    v_mov_b32_e32 v11, s9
+; SI-NEXT:    v_mov_b32_e32 v11, s43
 ; SI-NEXT:    v_mov_b32_e32 v12, s22
+; SI-NEXT:    v_mov_b32_e32 v13, s8
 ; SI-NEXT:    v_mov_b32_e32 v14, s23
-; SI-NEXT:    v_mov_b32_e32 v15, s8
+; SI-NEXT:    v_mov_b32_e32 v15, s42
 ; SI-NEXT:    v_mov_b32_e32 v16, s24
+; SI-NEXT:    v_mov_b32_e32 v17, s6
 ; SI-NEXT:    v_mov_b32_e32 v18, s25
-; SI-NEXT:    v_mov_b32_e32 v19, s7
+; SI-NEXT:    v_mov_b32_e32 v19, s41
 ; SI-NEXT:    v_mov_b32_e32 v20, s26
+; SI-NEXT:    v_mov_b32_e32 v21, s4
 ; SI-NEXT:    v_mov_b32_e32 v22, s27
-; SI-NEXT:    v_mov_b32_e32 v23, s6
+; SI-NEXT:    v_mov_b32_e32 v23, s40
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB49_4:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr11
-; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr45
+; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr44
 ; SI-NEXT:    ; implicit-def: $sgpr10
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $sgpr9
-; SI-NEXT:    ; implicit-def: $vgpr13
+; SI-NEXT:    ; implicit-def: $sgpr43
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $vgpr21
+; SI-NEXT:    ; implicit-def: $sgpr42
 ; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr41
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr40
 ; SI-NEXT:    s_branch .LBB49_2
 ;
 ; VI-LABEL: bitcast_v6i64_to_v24i16_scalar:
@@ -12464,149 +12465,150 @@ define inreg <24 x i16> @bitcast_v24f16_to_v24i16_scalar(<24 x half> inreg %a, i
 ; SI-LABEL: bitcast_v24f16_to_v24i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v11, v9
-; SI-NEXT:    v_mov_b32_e32 v12, v8
-; SI-NEXT:    v_mov_b32_e32 v13, v7
-; SI-NEXT:    v_mov_b32_e32 v20, v6
-; SI-NEXT:    v_mov_b32_e32 v19, v5
+; SI-NEXT:    v_mov_b32_e32 v13, v8
+; SI-NEXT:    v_mov_b32_e32 v17, v7
 ; SI-NEXT:    v_mov_b32_e32 v18, v4
-; SI-NEXT:    v_mov_b32_e32 v17, v3
-; SI-NEXT:    v_mov_b32_e32 v16, v2
-; SI-NEXT:    v_mov_b32_e32 v15, v1
+; SI-NEXT:    v_mov_b32_e32 v19, v3
 ; SI-NEXT:    v_mov_b32_e32 v14, v0
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, s16
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, s17
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, s18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, s19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, s20
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, s21
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, s22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, s23
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, s24
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, s25
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, s27
+; SI-NEXT:    v_cvt_f16_f32_e32 v12, s28
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
+; SI-NEXT:    v_cvt_f16_f32_e32 v15, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v16, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v30, v19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
-; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
-; SI-NEXT:    v_cvt_f16_f32_e32 v21, v13
-; SI-NEXT:    v_cvt_f16_f32_e32 v22, v12
-; SI-NEXT:    v_cvt_f16_f32_e32 v23, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v19, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v20, v6
+; SI-NEXT:    v_cvt_f16_f32_e32 v23, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v31, v17
+; SI-NEXT:    v_cvt_f16_f32_e32 v22, v13
+; SI-NEXT:    v_cvt_f16_f32_e32 v27, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, s18
+; SI-NEXT:    v_cvt_f16_f32_e32 v25, s21
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, s22
+; SI-NEXT:    v_cvt_f16_f32_e32 v28, s25
 ; SI-NEXT:    v_cvt_f16_f32_e32 v10, s26
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, s27
-; SI-NEXT:    v_cvt_f16_f32_e32 v12, s28
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, s29
+; SI-NEXT:    v_cvt_f16_f32_e32 v29, s29
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_cbranch_scc0 .LBB59_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_cbranch_execnz .LBB59_3
 ; SI-NEXT:  .LBB59_2: ; %cmp.true
-; SI-NEXT:    v_cvt_f32_f16_e32 v23, v23
-; SI-NEXT:    v_cvt_f32_f16_e32 v22, v22
-; SI-NEXT:    v_cvt_f32_f16_e32 v19, v19
-; SI-NEXT:    v_cvt_f32_f16_e32 v18, v18
-; SI-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; SI-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; SI-NEXT:    v_add_f32_e32 v23, 0x38000000, v23
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT:    v_add_f32_e32 v22, 0x38000000, v22
-; SI-NEXT:    v_cvt_f16_f32_e32 v23, v23
-; SI-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; SI-NEXT:    v_cvt_f32_f16_e32 v15, v15
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
-; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
-; SI-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
-; SI-NEXT:    v_add_f32_e32 v15, 0x38000000, v15
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; SI-NEXT:    v_cvt_f32_f16_e32 v19, v19
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
-; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
-; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; SI-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; SI-NEXT:    v_cvt_f32_f16_e32 v20, v20
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
 ; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; SI-NEXT:    v_cvt_f32_f16_e32 v18, v18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
-; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT:    v_or_b32_e32 v22, v22, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v19
+; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
+; SI-NEXT:    v_add_f32_e32 v15, 0x38000000, v15
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
+; SI-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v27
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
+; SI-NEXT:    v_or_b32_e32 v6, v6, v9
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v28
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
+; SI-NEXT:    v_or_b32_e32 v10, v10, v13
+; SI-NEXT:    v_cvt_f32_f16_e32 v13, v29
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v15
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; SI-NEXT:    v_or_b32_e32 v14, v14, v17
+; SI-NEXT:    v_cvt_f32_f16_e32 v17, v30
+; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v19
+; SI-NEXT:    v_cvt_f32_f16_e32 v23, v23
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_or_b32_e32 v2, v2, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v25
+; SI-NEXT:    v_or_b32_e32 v18, v18, v21
+; SI-NEXT:    v_cvt_f32_f16_e32 v21, v31
+; SI-NEXT:    v_cvt_f32_f16_e32 v22, v22
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
 ; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; SI-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; SI-NEXT:    v_cvt_f32_f16_e32 v20, v20
+; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
 ; SI-NEXT:    v_add_f32_e32 v17, 0x38000000, v17
+; SI-NEXT:    v_add_f32_e32 v23, 0x38000000, v23
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
 ; SI-NEXT:    v_add_f32_e32 v21, 0x38000000, v21
-; SI-NEXT:    v_or_b32_e32 v18, v18, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v23, v23
+; SI-NEXT:    v_add_f32_e32 v22, 0x38000000, v22
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
+; SI-NEXT:    v_cvt_f16_f32_e32 v21, v21
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
 ; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
 ; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
 ; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
 ; SI-NEXT:    v_add_f32_e32 v16, 0x38000000, v16
-; SI-NEXT:    v_cvt_f16_f32_e32 v21, v21
 ; SI-NEXT:    v_add_f32_e32 v20, 0x38000000, v20
-; SI-NEXT:    v_or_b32_e32 v14, v14, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v11
+; SI-NEXT:    v_lshr_b64 v[27:28], v[1:2], 16
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
 ; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
-; SI-NEXT:    v_or_b32_e32 v10, v10, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v7
-; SI-NEXT:    v_or_b32_e32 v6, v6, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v3
-; SI-NEXT:    v_or_b32_e32 v2, v2, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; SI-NEXT:    v_lshr_b64 v[28:29], v[9:10], 16
 ; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v23
+; SI-NEXT:    v_lshr_b64 v[29:30], v[13:14], 16
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_or_b32_e32 v22, v22, v24
 ; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; SI-NEXT:    v_lshr_b64 v[30:31], v[17:18], 16
+; SI-NEXT:    v_lshr_b64 v[25:26], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[31:32], v[21:22], 16
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SI-NEXT:    v_or_b32_e32 v4, v4, v5
 ; SI-NEXT:    v_or_b32_e32 v8, v8, v9
 ; SI-NEXT:    v_or_b32_e32 v12, v12, v13
 ; SI-NEXT:    v_or_b32_e32 v16, v16, v17
 ; SI-NEXT:    v_or_b32_e32 v20, v20, v21
-; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v9, v10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v13, v14, v13, 16
-; SI-NEXT:    v_alignbit_b32 v17, v18, v17, 16
-; SI-NEXT:    v_alignbit_b32 v21, v22, v21, 16
 ; SI-NEXT:  .LBB59_3: ; %end
+; SI-NEXT:    v_mov_b32_e32 v1, v27
+; SI-NEXT:    v_mov_b32_e32 v5, v25
+; SI-NEXT:    v_mov_b32_e32 v9, v28
+; SI-NEXT:    v_mov_b32_e32 v13, v29
+; SI-NEXT:    v_mov_b32_e32 v17, v30
+; SI-NEXT:    v_mov_b32_e32 v21, v31
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB59_4:
 ; SI-NEXT:    s_branch .LBB59_2
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll
index eaf314d4b65dc..718851f97bade 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll
@@ -2004,102 +2004,95 @@ define inreg <28 x i16> @bitcast_v14i32_to_v28i16_scalar(<14 x i32> inreg %a, i3
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_cbranch_scc0 .LBB13_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s28
-; SI-NEXT:    v_alignbit_b32 v25, s29, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s26
-; SI-NEXT:    v_alignbit_b32 v21, s27, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s24
-; SI-NEXT:    v_alignbit_b32 v17, s25, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    v_alignbit_b32 v13, s23, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v9, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s29, 16
-; SI-NEXT:    s_lshr_b32 s7, s27, 16
-; SI-NEXT:    s_lshr_b32 s8, s25, 16
-; SI-NEXT:    s_lshr_b32 s9, s23, 16
-; SI-NEXT:    s_lshr_b32 s10, s21, 16
-; SI-NEXT:    s_lshr_b32 s11, s19, 16
-; SI-NEXT:    s_lshr_b32 s12, s17, 16
+; SI-NEXT:    s_lshr_b32 s44, s29, 16
+; SI-NEXT:    s_lshr_b32 s45, s27, 16
+; SI-NEXT:    s_lshr_b32 s46, s25, 16
+; SI-NEXT:    s_lshr_b32 s47, s23, 16
+; SI-NEXT:    s_lshr_b32 s56, s21, 16
+; SI-NEXT:    s_lshr_b32 s57, s19, 16
+; SI-NEXT:    s_lshr_b32 s58, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB13_3
 ; SI-NEXT:  .LBB13_2: ; %cmp.true
-; SI-NEXT:    s_add_i32 s28, s28, 3
-; SI-NEXT:    s_add_i32 s26, s26, 3
-; SI-NEXT:    s_add_i32 s29, s29, 3
-; SI-NEXT:    v_mov_b32_e32 v0, s28
-; SI-NEXT:    s_add_i32 s24, s24, 3
-; SI-NEXT:    s_add_i32 s27, s27, 3
-; SI-NEXT:    v_alignbit_b32 v25, s29, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s26
-; SI-NEXT:    s_add_i32 s22, s22, 3
-; SI-NEXT:    s_add_i32 s25, s25, 3
-; SI-NEXT:    v_alignbit_b32 v21, s27, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s24
-; SI-NEXT:    s_add_i32 s20, s20, 3
-; SI-NEXT:    s_add_i32 s23, s23, 3
-; SI-NEXT:    v_alignbit_b32 v17, s25, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    s_add_i32 s18, s18, 3
-; SI-NEXT:    s_add_i32 s21, s21, 3
-; SI-NEXT:    v_alignbit_b32 v13, s23, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s20
+; SI-NEXT:    s_add_i32 s17, s17, 3
 ; SI-NEXT:    s_add_i32 s16, s16, 3
 ; SI-NEXT:    s_add_i32 s19, s19, 3
-; SI-NEXT:    v_alignbit_b32 v9, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    s_add_i32 s17, s17, 3
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s29, 16
-; SI-NEXT:    s_lshr_b32 s7, s27, 16
-; SI-NEXT:    s_lshr_b32 s8, s25, 16
-; SI-NEXT:    s_lshr_b32 s9, s23, 16
-; SI-NEXT:    s_lshr_b32 s10, s21, 16
-; SI-NEXT:    s_lshr_b32 s11, s19, 16
-; SI-NEXT:    s_lshr_b32 s12, s17, 16
+; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_add_i32 s21, s21, 3
+; SI-NEXT:    s_add_i32 s20, s20, 3
+; SI-NEXT:    s_add_i32 s23, s23, 3
+; SI-NEXT:    s_add_i32 s22, s22, 3
+; SI-NEXT:    s_add_i32 s25, s25, 3
+; SI-NEXT:    s_add_i32 s24, s24, 3
+; SI-NEXT:    s_add_i32 s27, s27, 3
+; SI-NEXT:    s_add_i32 s26, s26, 3
+; SI-NEXT:    s_add_i32 s29, s29, 3
+; SI-NEXT:    s_add_i32 s28, s28, 3
+; SI-NEXT:    s_lshr_b64 s[4:5], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[16:17], 16
+; SI-NEXT:    s_lshr_b32 s44, s29, 16
+; SI-NEXT:    s_lshr_b32 s45, s27, 16
+; SI-NEXT:    s_lshr_b32 s46, s25, 16
+; SI-NEXT:    s_lshr_b32 s47, s23, 16
+; SI-NEXT:    s_lshr_b32 s56, s21, 16
+; SI-NEXT:    s_lshr_b32 s57, s19, 16
+; SI-NEXT:    s_lshr_b32 s58, s17, 16
 ; SI-NEXT:  .LBB13_3: ; %end
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
+; SI-NEXT:    v_mov_b32_e32 v1, s40
 ; SI-NEXT:    v_mov_b32_e32 v2, s17
-; SI-NEXT:    v_mov_b32_e32 v3, s12
+; SI-NEXT:    v_mov_b32_e32 v3, s58
 ; SI-NEXT:    v_mov_b32_e32 v4, s18
+; SI-NEXT:    v_mov_b32_e32 v5, s14
 ; SI-NEXT:    v_mov_b32_e32 v6, s19
-; SI-NEXT:    v_mov_b32_e32 v7, s11
+; SI-NEXT:    v_mov_b32_e32 v7, s57
 ; SI-NEXT:    v_mov_b32_e32 v8, s20
+; SI-NEXT:    v_mov_b32_e32 v9, s12
 ; SI-NEXT:    v_mov_b32_e32 v10, s21
-; SI-NEXT:    v_mov_b32_e32 v11, s10
+; SI-NEXT:    v_mov_b32_e32 v11, s56
 ; SI-NEXT:    v_mov_b32_e32 v12, s22
+; SI-NEXT:    v_mov_b32_e32 v13, s10
 ; SI-NEXT:    v_mov_b32_e32 v14, s23
-; SI-NEXT:    v_mov_b32_e32 v15, s9
+; SI-NEXT:    v_mov_b32_e32 v15, s47
 ; SI-NEXT:    v_mov_b32_e32 v16, s24
+; SI-NEXT:    v_mov_b32_e32 v17, s8
 ; SI-NEXT:    v_mov_b32_e32 v18, s25
-; SI-NEXT:    v_mov_b32_e32 v19, s8
+; SI-NEXT:    v_mov_b32_e32 v19, s46
 ; SI-NEXT:    v_mov_b32_e32 v20, s26
+; SI-NEXT:    v_mov_b32_e32 v21, s6
 ; SI-NEXT:    v_mov_b32_e32 v22, s27
-; SI-NEXT:    v_mov_b32_e32 v23, s7
+; SI-NEXT:    v_mov_b32_e32 v23, s45
 ; SI-NEXT:    v_mov_b32_e32 v24, s28
+; SI-NEXT:    v_mov_b32_e32 v25, s4
 ; SI-NEXT:    v_mov_b32_e32 v26, s29
-; SI-NEXT:    v_mov_b32_e32 v27, s6
+; SI-NEXT:    v_mov_b32_e32 v27, s44
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB13_4:
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $sgpr40
+; SI-NEXT:    ; implicit-def: $sgpr58
+; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr57
 ; SI-NEXT:    ; implicit-def: $sgpr12
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $sgpr11
-; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $sgpr56
 ; SI-NEXT:    ; implicit-def: $sgpr10
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $sgpr9
-; SI-NEXT:    ; implicit-def: $vgpr17
+; SI-NEXT:    ; implicit-def: $sgpr47
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $vgpr25
+; SI-NEXT:    ; implicit-def: $sgpr46
 ; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr45
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr44
 ; SI-NEXT:    s_branch .LBB13_2
 ;
 ; VI-LABEL: bitcast_v14i32_to_v28i16_scalar:
@@ -5543,96 +5536,111 @@ define inreg <28 x i16> @bitcast_v14f32_to_v28i16_scalar(<14 x float> inreg %a,
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_cbranch_scc0 .LBB29_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s28
-; SI-NEXT:    v_alignbit_b32 v25, s29, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s26
-; SI-NEXT:    v_alignbit_b32 v21, s27, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s24
-; SI-NEXT:    v_alignbit_b32 v17, s25, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    v_alignbit_b32 v13, s23, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v9, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s12, s29, 16
-; SI-NEXT:    s_lshr_b32 s11, s27, 16
-; SI-NEXT:    s_lshr_b32 s10, s25, 16
-; SI-NEXT:    s_lshr_b32 s9, s23, 16
-; SI-NEXT:    s_lshr_b32 s8, s21, 16
-; SI-NEXT:    s_lshr_b32 s7, s19, 16
-; SI-NEXT:    s_lshr_b32 s6, s17, 16
+; SI-NEXT:    s_lshr_b32 s58, s29, 16
+; SI-NEXT:    s_lshr_b32 s57, s27, 16
+; SI-NEXT:    s_lshr_b32 s56, s25, 16
+; SI-NEXT:    s_lshr_b32 s47, s23, 16
+; SI-NEXT:    s_lshr_b32 s46, s21, 16
+; SI-NEXT:    s_lshr_b32 s45, s19, 16
+; SI-NEXT:    s_lshr_b32 s44, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB29_4
 ; SI-NEXT:  .LBB29_2: ; %cmp.true
-; SI-NEXT:    v_add_f32_e64 v2, s17, 1.0
-; SI-NEXT:    v_add_f32_e64 v0, s16, 1.0
-; SI-NEXT:    v_add_f32_e64 v6, s19, 1.0
-; SI-NEXT:    v_add_f32_e64 v4, s18, 1.0
-; SI-NEXT:    v_add_f32_e64 v10, s21, 1.0
-; SI-NEXT:    v_add_f32_e64 v8, s20, 1.0
-; SI-NEXT:    v_add_f32_e64 v14, s23, 1.0
-; SI-NEXT:    v_add_f32_e64 v12, s22, 1.0
-; SI-NEXT:    v_add_f32_e64 v18, s25, 1.0
-; SI-NEXT:    v_add_f32_e64 v16, s24, 1.0
-; SI-NEXT:    v_add_f32_e64 v22, s27, 1.0
-; SI-NEXT:    v_add_f32_e64 v20, s26, 1.0
-; SI-NEXT:    v_add_f32_e64 v26, s29, 1.0
-; SI-NEXT:    v_add_f32_e64 v24, s28, 1.0
-; SI-NEXT:    v_alignbit_b32 v25, v26, v24, 16
-; SI-NEXT:    v_alignbit_b32 v21, v22, v20, 16
-; SI-NEXT:    v_alignbit_b32 v17, v18, v16, 16
-; SI-NEXT:    v_alignbit_b32 v13, v14, v12, 16
-; SI-NEXT:    v_alignbit_b32 v9, v10, v8, 16
-; SI-NEXT:    v_alignbit_b32 v5, v6, v4, 16
-; SI-NEXT:    v_alignbit_b32 v1, v2, v0, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v26
-; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v22
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-NEXT:    s_setpc_b64 s[30:31]
+; SI-NEXT:    v_add_f32_e64 v49, s17, 1.0
+; SI-NEXT:    v_add_f32_e64 v48, s16, 1.0
+; SI-NEXT:    v_add_f32_e64 v39, s19, 1.0
+; SI-NEXT:    v_add_f32_e64 v38, s18, 1.0
+; SI-NEXT:    v_add_f32_e64 v37, s21, 1.0
+; SI-NEXT:    v_add_f32_e64 v36, s20, 1.0
+; SI-NEXT:    v_add_f32_e64 v35, s23, 1.0
+; SI-NEXT:    v_add_f32_e64 v34, s22, 1.0
+; SI-NEXT:    v_add_f32_e64 v33, s25, 1.0
+; SI-NEXT:    v_add_f32_e64 v32, s24, 1.0
+; SI-NEXT:    v_add_f32_e64 v31, s27, 1.0
+; SI-NEXT:    v_add_f32_e64 v30, s26, 1.0
+; SI-NEXT:    v_add_f32_e64 v29, s29, 1.0
+; SI-NEXT:    v_add_f32_e64 v28, s28, 1.0
+; SI-NEXT:    v_lshr_b64 v[25:26], v[28:29], 16
+; SI-NEXT:    v_lshr_b64 v[21:22], v[30:31], 16
+; SI-NEXT:    v_lshr_b64 v[17:18], v[32:33], 16
+; SI-NEXT:    v_lshr_b64 v[13:14], v[34:35], 16
+; SI-NEXT:    v_lshr_b64 v[9:10], v[36:37], 16
+; SI-NEXT:    v_lshr_b64 v[5:6], v[38:39], 16
+; SI-NEXT:    v_lshr_b64 v[1:2], v[48:49], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v29
+; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v33
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v35
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v37
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v39
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v49
+; SI-NEXT:    s_branch .LBB29_5
 ; SI-NEXT:  .LBB29_3:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $sgpr9
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $sgpr10
-; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $sgpr11
-; SI-NEXT:    ; implicit-def: $vgpr25
+; SI-NEXT:    ; implicit-def: $sgpr40
+; SI-NEXT:    ; implicit-def: $sgpr44
+; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr45
 ; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr47
+; SI-NEXT:    ; implicit-def: $sgpr8
+; SI-NEXT:    ; implicit-def: $sgpr56
+; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr57
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr58
 ; SI-NEXT:    s_branch .LBB29_2
 ; SI-NEXT:  .LBB29_4:
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_mov_b32_e32 v2, s17
-; SI-NEXT:    v_mov_b32_e32 v4, s18
-; SI-NEXT:    v_mov_b32_e32 v6, s19
-; SI-NEXT:    v_mov_b32_e32 v8, s20
-; SI-NEXT:    v_mov_b32_e32 v10, s21
-; SI-NEXT:    v_mov_b32_e32 v12, s22
-; SI-NEXT:    v_mov_b32_e32 v14, s23
-; SI-NEXT:    v_mov_b32_e32 v16, s24
-; SI-NEXT:    v_mov_b32_e32 v18, s25
-; SI-NEXT:    v_mov_b32_e32 v20, s26
-; SI-NEXT:    v_mov_b32_e32 v22, s27
-; SI-NEXT:    v_mov_b32_e32 v24, s28
-; SI-NEXT:    v_mov_b32_e32 v26, s29
-; SI-NEXT:    v_mov_b32_e32 v3, s6
-; SI-NEXT:    v_mov_b32_e32 v7, s7
-; SI-NEXT:    v_mov_b32_e32 v11, s8
-; SI-NEXT:    v_mov_b32_e32 v15, s9
-; SI-NEXT:    v_mov_b32_e32 v19, s10
-; SI-NEXT:    v_mov_b32_e32 v23, s11
-; SI-NEXT:    v_mov_b32_e32 v27, s12
+; SI-NEXT:    v_mov_b32_e32 v48, s16
+; SI-NEXT:    v_mov_b32_e32 v49, s17
+; SI-NEXT:    v_mov_b32_e32 v38, s18
+; SI-NEXT:    v_mov_b32_e32 v39, s19
+; SI-NEXT:    v_mov_b32_e32 v36, s20
+; SI-NEXT:    v_mov_b32_e32 v37, s21
+; SI-NEXT:    v_mov_b32_e32 v34, s22
+; SI-NEXT:    v_mov_b32_e32 v35, s23
+; SI-NEXT:    v_mov_b32_e32 v32, s24
+; SI-NEXT:    v_mov_b32_e32 v33, s25
+; SI-NEXT:    v_mov_b32_e32 v30, s26
+; SI-NEXT:    v_mov_b32_e32 v31, s27
+; SI-NEXT:    v_mov_b32_e32 v28, s28
+; SI-NEXT:    v_mov_b32_e32 v29, s29
+; SI-NEXT:    v_mov_b32_e32 v3, s44
+; SI-NEXT:    v_mov_b32_e32 v7, s45
+; SI-NEXT:    v_mov_b32_e32 v11, s46
+; SI-NEXT:    v_mov_b32_e32 v15, s47
+; SI-NEXT:    v_mov_b32_e32 v19, s56
+; SI-NEXT:    v_mov_b32_e32 v23, s57
+; SI-NEXT:    v_mov_b32_e32 v27, s58
+; SI-NEXT:    v_mov_b32_e32 v25, s4
+; SI-NEXT:    v_mov_b32_e32 v21, s6
+; SI-NEXT:    v_mov_b32_e32 v17, s8
+; SI-NEXT:    v_mov_b32_e32 v13, s10
+; SI-NEXT:    v_mov_b32_e32 v9, s12
+; SI-NEXT:    v_mov_b32_e32 v5, s14
+; SI-NEXT:    v_mov_b32_e32 v1, s40
+; SI-NEXT:  .LBB29_5: ; %end
+; SI-NEXT:    v_mov_b32_e32 v0, v48
+; SI-NEXT:    v_mov_b32_e32 v2, v49
+; SI-NEXT:    v_mov_b32_e32 v4, v38
+; SI-NEXT:    v_mov_b32_e32 v6, v39
+; SI-NEXT:    v_mov_b32_e32 v8, v36
+; SI-NEXT:    v_mov_b32_e32 v10, v37
+; SI-NEXT:    v_mov_b32_e32 v12, v34
+; SI-NEXT:    v_mov_b32_e32 v14, v35
+; SI-NEXT:    v_mov_b32_e32 v16, v32
+; SI-NEXT:    v_mov_b32_e32 v18, v33
+; SI-NEXT:    v_mov_b32_e32 v20, v30
+; SI-NEXT:    v_mov_b32_e32 v22, v31
+; SI-NEXT:    v_mov_b32_e32 v24, v28
+; SI-NEXT:    v_mov_b32_e32 v26, v29
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bitcast_v14f32_to_v28i16_scalar:
@@ -8475,102 +8483,95 @@ define inreg <28 x i16> @bitcast_v7i64_to_v28i16_scalar(<7 x i64> inreg %a, i32
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_cbranch_scc0 .LBB41_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s28
-; SI-NEXT:    v_alignbit_b32 v25, s29, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s26
-; SI-NEXT:    v_alignbit_b32 v21, s27, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s24
-; SI-NEXT:    v_alignbit_b32 v17, s25, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    v_alignbit_b32 v13, s23, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v9, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s29, 16
-; SI-NEXT:    s_lshr_b32 s7, s27, 16
-; SI-NEXT:    s_lshr_b32 s8, s25, 16
-; SI-NEXT:    s_lshr_b32 s9, s23, 16
-; SI-NEXT:    s_lshr_b32 s10, s21, 16
-; SI-NEXT:    s_lshr_b32 s11, s19, 16
-; SI-NEXT:    s_lshr_b32 s12, s17, 16
+; SI-NEXT:    s_lshr_b32 s44, s29, 16
+; SI-NEXT:    s_lshr_b32 s45, s27, 16
+; SI-NEXT:    s_lshr_b32 s46, s25, 16
+; SI-NEXT:    s_lshr_b32 s47, s23, 16
+; SI-NEXT:    s_lshr_b32 s56, s21, 16
+; SI-NEXT:    s_lshr_b32 s57, s19, 16
+; SI-NEXT:    s_lshr_b32 s58, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB41_3
 ; SI-NEXT:  .LBB41_2: ; %cmp.true
-; SI-NEXT:    s_add_u32 s16, s16, 3
-; SI-NEXT:    s_addc_u32 s17, s17, 0
-; SI-NEXT:    s_add_u32 s18, s18, 3
-; SI-NEXT:    s_addc_u32 s19, s19, 0
-; SI-NEXT:    s_add_u32 s20, s20, 3
-; SI-NEXT:    s_addc_u32 s21, s21, 0
-; SI-NEXT:    s_add_u32 s22, s22, 3
-; SI-NEXT:    s_addc_u32 s23, s23, 0
-; SI-NEXT:    s_add_u32 s24, s24, 3
-; SI-NEXT:    s_addc_u32 s25, s25, 0
-; SI-NEXT:    s_add_u32 s26, s26, 3
-; SI-NEXT:    s_addc_u32 s27, s27, 0
 ; SI-NEXT:    s_add_u32 s28, s28, 3
 ; SI-NEXT:    s_addc_u32 s29, s29, 0
-; SI-NEXT:    v_mov_b32_e32 v0, s28
-; SI-NEXT:    v_alignbit_b32 v25, s29, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s26
-; SI-NEXT:    v_alignbit_b32 v21, s27, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s24
-; SI-NEXT:    v_alignbit_b32 v17, s25, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    v_alignbit_b32 v13, s23, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v9, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s29, 16
-; SI-NEXT:    s_lshr_b32 s7, s27, 16
-; SI-NEXT:    s_lshr_b32 s8, s25, 16
-; SI-NEXT:    s_lshr_b32 s9, s23, 16
-; SI-NEXT:    s_lshr_b32 s10, s21, 16
-; SI-NEXT:    s_lshr_b32 s11, s19, 16
-; SI-NEXT:    s_lshr_b32 s12, s17, 16
+; SI-NEXT:    s_add_u32 s26, s26, 3
+; SI-NEXT:    s_addc_u32 s27, s27, 0
+; SI-NEXT:    s_add_u32 s24, s24, 3
+; SI-NEXT:    s_addc_u32 s25, s25, 0
+; SI-NEXT:    s_add_u32 s22, s22, 3
+; SI-NEXT:    s_addc_u32 s23, s23, 0
+; SI-NEXT:    s_add_u32 s20, s20, 3
+; SI-NEXT:    s_addc_u32 s21, s21, 0
+; SI-NEXT:    s_add_u32 s18, s18, 3
+; SI-NEXT:    s_addc_u32 s19, s19, 0
+; SI-NEXT:    s_add_u32 s16, s16, 3
+; SI-NEXT:    s_addc_u32 s17, s17, 0
+; SI-NEXT:    s_lshr_b32 s44, s29, 16
+; SI-NEXT:    s_lshr_b32 s45, s27, 16
+; SI-NEXT:    s_lshr_b32 s46, s25, 16
+; SI-NEXT:    s_lshr_b32 s47, s23, 16
+; SI-NEXT:    s_lshr_b32 s56, s21, 16
+; SI-NEXT:    s_lshr_b32 s57, s19, 16
+; SI-NEXT:    s_lshr_b32 s58, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[16:17], 16
 ; SI-NEXT:  .LBB41_3: ; %end
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
+; SI-NEXT:    v_mov_b32_e32 v1, s40
 ; SI-NEXT:    v_mov_b32_e32 v2, s17
-; SI-NEXT:    v_mov_b32_e32 v3, s12
+; SI-NEXT:    v_mov_b32_e32 v3, s58
 ; SI-NEXT:    v_mov_b32_e32 v4, s18
+; SI-NEXT:    v_mov_b32_e32 v5, s14
 ; SI-NEXT:    v_mov_b32_e32 v6, s19
-; SI-NEXT:    v_mov_b32_e32 v7, s11
+; SI-NEXT:    v_mov_b32_e32 v7, s57
 ; SI-NEXT:    v_mov_b32_e32 v8, s20
+; SI-NEXT:    v_mov_b32_e32 v9, s12
 ; SI-NEXT:    v_mov_b32_e32 v10, s21
-; SI-NEXT:    v_mov_b32_e32 v11, s10
+; SI-NEXT:    v_mov_b32_e32 v11, s56
 ; SI-NEXT:    v_mov_b32_e32 v12, s22
+; SI-NEXT:    v_mov_b32_e32 v13, s10
 ; SI-NEXT:    v_mov_b32_e32 v14, s23
-; SI-NEXT:    v_mov_b32_e32 v15, s9
+; SI-NEXT:    v_mov_b32_e32 v15, s47
 ; SI-NEXT:    v_mov_b32_e32 v16, s24
+; SI-NEXT:    v_mov_b32_e32 v17, s8
 ; SI-NEXT:    v_mov_b32_e32 v18, s25
-; SI-NEXT:    v_mov_b32_e32 v19, s8
+; SI-NEXT:    v_mov_b32_e32 v19, s46
 ; SI-NEXT:    v_mov_b32_e32 v20, s26
+; SI-NEXT:    v_mov_b32_e32 v21, s6
 ; SI-NEXT:    v_mov_b32_e32 v22, s27
-; SI-NEXT:    v_mov_b32_e32 v23, s7
+; SI-NEXT:    v_mov_b32_e32 v23, s45
 ; SI-NEXT:    v_mov_b32_e32 v24, s28
+; SI-NEXT:    v_mov_b32_e32 v25, s4
 ; SI-NEXT:    v_mov_b32_e32 v26, s29
-; SI-NEXT:    v_mov_b32_e32 v27, s6
+; SI-NEXT:    v_mov_b32_e32 v27, s44
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB41_4:
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $sgpr40
+; SI-NEXT:    ; implicit-def: $sgpr58
+; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr57
 ; SI-NEXT:    ; implicit-def: $sgpr12
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $sgpr11
-; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $sgpr56
 ; SI-NEXT:    ; implicit-def: $sgpr10
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $sgpr9
-; SI-NEXT:    ; implicit-def: $vgpr17
+; SI-NEXT:    ; implicit-def: $sgpr47
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $vgpr25
+; SI-NEXT:    ; implicit-def: $sgpr46
 ; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr45
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr44
 ; SI-NEXT:    s_branch .LBB41_2
 ;
 ; VI-LABEL: bitcast_v7i64_to_v28i16_scalar:
@@ -10809,104 +10810,104 @@ define inreg <28 x i16> @bitcast_v7f64_to_v28i16_scalar(<7 x double> inreg %a, i
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_cbranch_scc0 .LBB49_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s28
-; SI-NEXT:    v_alignbit_b32 v28, s29, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s26
-; SI-NEXT:    v_alignbit_b32 v29, s27, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s24
-; SI-NEXT:    v_alignbit_b32 v30, s25, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s22
-; SI-NEXT:    v_alignbit_b32 v31, s23, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s20
-; SI-NEXT:    v_alignbit_b32 v32, s21, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v33, s19, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v34, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s12, s29, 16
-; SI-NEXT:    s_lshr_b32 s11, s27, 16
-; SI-NEXT:    s_lshr_b32 s10, s25, 16
-; SI-NEXT:    s_lshr_b32 s9, s23, 16
-; SI-NEXT:    s_lshr_b32 s8, s21, 16
-; SI-NEXT:    s_lshr_b32 s7, s19, 16
-; SI-NEXT:    s_lshr_b32 s6, s17, 16
+; SI-NEXT:    s_lshr_b32 s58, s29, 16
+; SI-NEXT:    s_lshr_b32 s57, s27, 16
+; SI-NEXT:    s_lshr_b32 s56, s25, 16
+; SI-NEXT:    s_lshr_b32 s47, s23, 16
+; SI-NEXT:    s_lshr_b32 s46, s21, 16
+; SI-NEXT:    s_lshr_b32 s45, s19, 16
+; SI-NEXT:    s_lshr_b32 s44, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB49_4
 ; SI-NEXT:  .LBB49_2: ; %cmp.true
-; SI-NEXT:    v_add_f64 v[0:1], s[16:17], 1.0
-; SI-NEXT:    v_add_f64 v[4:5], s[18:19], 1.0
-; SI-NEXT:    v_add_f64 v[8:9], s[20:21], 1.0
-; SI-NEXT:    v_add_f64 v[12:13], s[22:23], 1.0
-; SI-NEXT:    v_add_f64 v[16:17], s[24:25], 1.0
-; SI-NEXT:    v_add_f64 v[24:25], s[28:29], 1.0
-; SI-NEXT:    v_add_f64 v[20:21], s[26:27], 1.0
-; SI-NEXT:    v_alignbit_b32 v28, v25, v24, 16
-; SI-NEXT:    v_alignbit_b32 v29, v21, v20, 16
-; SI-NEXT:    v_alignbit_b32 v30, v17, v16, 16
-; SI-NEXT:    v_alignbit_b32 v31, v13, v12, 16
-; SI-NEXT:    v_alignbit_b32 v32, v9, v8, 16
-; SI-NEXT:    v_alignbit_b32 v33, v5, v4, 16
-; SI-NEXT:    v_alignbit_b32 v34, v1, v0, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v25
-; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v21
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v13
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; SI-NEXT:    v_add_f64 v[28:29], s[28:29], 1.0
+; SI-NEXT:    v_add_f64 v[30:31], s[26:27], 1.0
+; SI-NEXT:    v_add_f64 v[32:33], s[24:25], 1.0
+; SI-NEXT:    v_add_f64 v[34:35], s[22:23], 1.0
+; SI-NEXT:    v_add_f64 v[36:37], s[20:21], 1.0
+; SI-NEXT:    v_add_f64 v[38:39], s[18:19], 1.0
+; SI-NEXT:    v_add_f64 v[48:49], s[16:17], 1.0
+; SI-NEXT:    v_lshr_b64 v[25:26], v[28:29], 16
+; SI-NEXT:    v_lshr_b64 v[21:22], v[30:31], 16
+; SI-NEXT:    v_lshr_b64 v[17:18], v[32:33], 16
+; SI-NEXT:    v_lshr_b64 v[13:14], v[34:35], 16
+; SI-NEXT:    v_lshr_b64 v[9:10], v[36:37], 16
+; SI-NEXT:    v_lshr_b64 v[5:6], v[38:39], 16
+; SI-NEXT:    v_lshr_b64 v[1:2], v[48:49], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v29
+; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v33
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v35
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v37
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v39
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v49
 ; SI-NEXT:    s_branch .LBB49_5
 ; SI-NEXT:  .LBB49_3:
-; SI-NEXT:    ; implicit-def: $vgpr34
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr33
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; implicit-def: $sgpr9
-; SI-NEXT:    ; implicit-def: $vgpr30
-; SI-NEXT:    ; implicit-def: $sgpr10
-; SI-NEXT:    ; implicit-def: $vgpr29
-; SI-NEXT:    ; implicit-def: $sgpr11
-; SI-NEXT:    ; implicit-def: $vgpr28
+; SI-NEXT:    ; implicit-def: $sgpr40
+; SI-NEXT:    ; implicit-def: $sgpr44
+; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr45
 ; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr47
+; SI-NEXT:    ; implicit-def: $sgpr8
+; SI-NEXT:    ; implicit-def: $sgpr56
+; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr57
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr58
 ; SI-NEXT:    s_branch .LBB49_2
 ; SI-NEXT:  .LBB49_4:
-; SI-NEXT:    v_mov_b32_e32 v5, s19
-; SI-NEXT:    v_mov_b32_e32 v9, s21
-; SI-NEXT:    v_mov_b32_e32 v13, s23
-; SI-NEXT:    v_mov_b32_e32 v17, s25
-; SI-NEXT:    v_mov_b32_e32 v21, s27
-; SI-NEXT:    v_mov_b32_e32 v25, s29
-; SI-NEXT:    v_mov_b32_e32 v24, s28
-; SI-NEXT:    v_mov_b32_e32 v20, s26
-; SI-NEXT:    v_mov_b32_e32 v16, s24
-; SI-NEXT:    v_mov_b32_e32 v12, s22
-; SI-NEXT:    v_mov_b32_e32 v8, s20
-; SI-NEXT:    v_mov_b32_e32 v4, s18
-; SI-NEXT:    v_mov_b32_e32 v1, s17
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_mov_b32_e32 v3, s6
-; SI-NEXT:    v_mov_b32_e32 v7, s7
-; SI-NEXT:    v_mov_b32_e32 v11, s8
-; SI-NEXT:    v_mov_b32_e32 v15, s9
-; SI-NEXT:    v_mov_b32_e32 v19, s10
-; SI-NEXT:    v_mov_b32_e32 v23, s11
-; SI-NEXT:    v_mov_b32_e32 v27, s12
+; SI-NEXT:    v_mov_b32_e32 v29, s29
+; SI-NEXT:    v_mov_b32_e32 v31, s27
+; SI-NEXT:    v_mov_b32_e32 v33, s25
+; SI-NEXT:    v_mov_b32_e32 v35, s23
+; SI-NEXT:    v_mov_b32_e32 v37, s21
+; SI-NEXT:    v_mov_b32_e32 v39, s19
+; SI-NEXT:    v_mov_b32_e32 v49, s17
+; SI-NEXT:    v_mov_b32_e32 v48, s16
+; SI-NEXT:    v_mov_b32_e32 v38, s18
+; SI-NEXT:    v_mov_b32_e32 v36, s20
+; SI-NEXT:    v_mov_b32_e32 v34, s22
+; SI-NEXT:    v_mov_b32_e32 v32, s24
+; SI-NEXT:    v_mov_b32_e32 v30, s26
+; SI-NEXT:    v_mov_b32_e32 v28, s28
+; SI-NEXT:    v_mov_b32_e32 v27, s58
+; SI-NEXT:    v_mov_b32_e32 v23, s57
+; SI-NEXT:    v_mov_b32_e32 v19, s56
+; SI-NEXT:    v_mov_b32_e32 v15, s47
+; SI-NEXT:    v_mov_b32_e32 v11, s46
+; SI-NEXT:    v_mov_b32_e32 v7, s45
+; SI-NEXT:    v_mov_b32_e32 v3, s44
+; SI-NEXT:    v_mov_b32_e32 v1, s40
+; SI-NEXT:    v_mov_b32_e32 v5, s14
+; SI-NEXT:    v_mov_b32_e32 v9, s12
+; SI-NEXT:    v_mov_b32_e32 v13, s10
+; SI-NEXT:    v_mov_b32_e32 v17, s8
+; SI-NEXT:    v_mov_b32_e32 v21, s6
+; SI-NEXT:    v_mov_b32_e32 v25, s4
 ; SI-NEXT:  .LBB49_5: ; %end
-; SI-NEXT:    v_mov_b32_e32 v2, v1
-; SI-NEXT:    v_mov_b32_e32 v6, v5
-; SI-NEXT:    v_mov_b32_e32 v10, v9
-; SI-NEXT:    v_mov_b32_e32 v14, v13
-; SI-NEXT:    v_mov_b32_e32 v18, v17
-; SI-NEXT:    v_mov_b32_e32 v22, v21
-; SI-NEXT:    v_mov_b32_e32 v26, v25
-; SI-NEXT:    v_mov_b32_e32 v1, v34
-; SI-NEXT:    v_mov_b32_e32 v5, v33
-; SI-NEXT:    v_mov_b32_e32 v9, v32
-; SI-NEXT:    v_mov_b32_e32 v13, v31
-; SI-NEXT:    v_mov_b32_e32 v17, v30
-; SI-NEXT:    v_mov_b32_e32 v21, v29
-; SI-NEXT:    v_mov_b32_e32 v25, v28
+; SI-NEXT:    v_mov_b32_e32 v0, v48
+; SI-NEXT:    v_mov_b32_e32 v2, v49
+; SI-NEXT:    v_mov_b32_e32 v4, v38
+; SI-NEXT:    v_mov_b32_e32 v6, v39
+; SI-NEXT:    v_mov_b32_e32 v8, v36
+; SI-NEXT:    v_mov_b32_e32 v10, v37
+; SI-NEXT:    v_mov_b32_e32 v12, v34
+; SI-NEXT:    v_mov_b32_e32 v14, v35
+; SI-NEXT:    v_mov_b32_e32 v16, v32
+; SI-NEXT:    v_mov_b32_e32 v18, v33
+; SI-NEXT:    v_mov_b32_e32 v20, v30
+; SI-NEXT:    v_mov_b32_e32 v22, v31
+; SI-NEXT:    v_mov_b32_e32 v24, v28
+; SI-NEXT:    v_mov_b32_e32 v26, v29
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bitcast_v7f64_to_v28i16_scalar:
@@ -13866,83 +13867,107 @@ define inreg <28 x i16> @bitcast_v28f16_to_v28i16_scalar(<28 x half> inreg %a, i
 ; SI-LABEL: bitcast_v28f16_to_v28i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v19, v5
+; SI-NEXT:    v_mov_b32_e32 v17, v12
+; SI-NEXT:    v_mov_b32_e32 v21, v11
+; SI-NEXT:    v_mov_b32_e32 v22, v8
+; SI-NEXT:    v_mov_b32_e32 v25, v7
 ; SI-NEXT:    v_mov_b32_e32 v18, v4
-; SI-NEXT:    v_mov_b32_e32 v17, v3
-; SI-NEXT:    v_mov_b32_e32 v16, v2
-; SI-NEXT:    v_mov_b32_e32 v15, v1
-; SI-NEXT:    v_mov_b32_e32 v20, v0
+; SI-NEXT:    v_mov_b32_e32 v26, v3
+; SI-NEXT:    v_mov_b32_e32 v28, v0
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v14
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, s16
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, s17
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, s18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, s19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, s20
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, s21
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v20
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
-; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
+; SI-NEXT:    v_cvt_f16_f32_e32 v7, s23
+; SI-NEXT:    v_cvt_f16_f32_e32 v8, s24
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, s27
+; SI-NEXT:    v_cvt_f16_f32_e32 v15, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v16, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v19, v5
 ; SI-NEXT:    v_cvt_f16_f32_e32 v20, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v21, v7
-; SI-NEXT:    v_cvt_f16_f32_e32 v22, v8
 ; SI-NEXT:    v_cvt_f16_f32_e32 v23, v9
 ; SI-NEXT:    v_cvt_f16_f32_e32 v24, v10
-; SI-NEXT:    v_cvt_f16_f32_e32 v25, v11
-; SI-NEXT:    v_cvt_f16_f32_e32 v26, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v27, v13
+; SI-NEXT:    v_cvt_f16_f32_e32 v12, s28
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v28
+; SI-NEXT:    v_cvt_f16_f32_e32 v30, v26
+; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
+; SI-NEXT:    v_cvt_f16_f32_e32 v36, v25
+; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
+; SI-NEXT:    v_cvt_f16_f32_e32 v37, v21
+; SI-NEXT:    v_cvt_f16_f32_e32 v26, v17
+; SI-NEXT:    v_cvt_f16_f32_e32 v34, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, s18
+; SI-NEXT:    v_cvt_f16_f32_e32 v32, s21
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, s22
-; SI-NEXT:    v_cvt_f16_f32_e32 v7, s23
-; SI-NEXT:    v_cvt_f16_f32_e32 v8, s24
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, s25
+; SI-NEXT:    v_cvt_f16_f32_e32 v29, s25
 ; SI-NEXT:    v_cvt_f16_f32_e32 v10, s26
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, s27
-; SI-NEXT:    v_cvt_f16_f32_e32 v12, s28
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, s29
+; SI-NEXT:    v_cvt_f16_f32_e32 v35, s29
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_cbranch_scc0 .LBB59_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_cbranch_execnz .LBB59_3
 ; SI-NEXT:  .LBB59_2: ; %cmp.true
-; SI-NEXT:    v_cvt_f32_f16_e32 v27, v27
-; SI-NEXT:    v_cvt_f32_f16_e32 v26, v26
-; SI-NEXT:    v_cvt_f32_f16_e32 v23, v23
-; SI-NEXT:    v_cvt_f32_f16_e32 v22, v22
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
 ; SI-NEXT:    v_cvt_f32_f16_e32 v19, v19
-; SI-NEXT:    v_cvt_f32_f16_e32 v18, v18
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
 ; SI-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v27
+; SI-NEXT:    v_cvt_f32_f16_e32 v18, v18
+; SI-NEXT:    v_cvt_f32_f16_e32 v23, v23
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; SI-NEXT:    v_add_f32_e32 v26, 0x38000000, v26
-; SI-NEXT:    v_cvt_f16_f32_e32 v27, v27
-; SI-NEXT:    v_add_f32_e32 v23, 0x38000000, v23
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT:    v_cvt_f16_f32_e32 v26, v26
-; SI-NEXT:    v_cvt_f16_f32_e32 v23, v23
-; SI-NEXT:    v_add_f32_e32 v22, 0x38000000, v22
+; SI-NEXT:    v_cvt_f32_f16_e32 v22, v22
+; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
+; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
 ; SI-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; SI-NEXT:    v_add_f32_e32 v15, 0x38000000, v15
 ; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
 ; SI-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
-; SI-NEXT:    v_add_f32_e32 v15, 0x38000000, v15
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
-; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT:    v_cvt_f32_f16_e32 v25, v25
-; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
+; SI-NEXT:    v_add_f32_e32 v23, 0x38000000, v23
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
 ; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
 ; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
-; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
+; SI-NEXT:    v_cvt_f16_f32_e32 v23, v23
+; SI-NEXT:    v_add_f32_e32 v22, 0x38000000, v22
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v34
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
+; SI-NEXT:    v_or_b32_e32 v10, v10, v13
+; SI-NEXT:    v_cvt_f32_f16_e32 v13, v35
+; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v19
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; SI-NEXT:    v_or_b32_e32 v6, v6, v9
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v29
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v15
+; SI-NEXT:    v_or_b32_e32 v18, v18, v21
+; SI-NEXT:    v_cvt_f32_f16_e32 v21, v36
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v23
+; SI-NEXT:    v_cvt_f32_f16_e32 v27, v27
+; SI-NEXT:    v_or_b32_e32 v2, v2, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v32
+; SI-NEXT:    v_or_b32_e32 v14, v14, v17
+; SI-NEXT:    v_cvt_f32_f16_e32 v17, v30
+; SI-NEXT:    v_or_b32_e32 v22, v22, v25
+; SI-NEXT:    v_cvt_f32_f16_e32 v25, v37
+; SI-NEXT:    v_cvt_f32_f16_e32 v26, v26
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
@@ -13950,48 +13975,30 @@ define inreg <28 x i16> @bitcast_v28f16_to_v28i16_scalar(<28 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f32_f16_e32 v16, v16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v20, v20
 ; SI-NEXT:    v_cvt_f32_f16_e32 v24, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
-; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
-; SI-NEXT:    v_or_b32_e32 v26, v26, v28
-; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v23
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
-; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT:    v_or_b32_e32 v22, v22, v28
-; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v19
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
 ; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
-; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
-; SI-NEXT:    v_add_f32_e32 v17, 0x38000000, v17
+; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
 ; SI-NEXT:    v_add_f32_e32 v21, 0x38000000, v21
+; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v27
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; SI-NEXT:    v_add_f32_e32 v17, 0x38000000, v17
+; SI-NEXT:    v_cvt_f16_f32_e32 v21, v21
 ; SI-NEXT:    v_add_f32_e32 v25, 0x38000000, v25
-; SI-NEXT:    v_or_b32_e32 v18, v18, v28
-; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v27, v27
+; SI-NEXT:    v_add_f32_e32 v26, 0x38000000, v26
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
+; SI-NEXT:    v_cvt_f16_f32_e32 v26, v26
+; SI-NEXT:    v_cvt_f16_f32_e32 v25, v25
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
 ; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
 ; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
 ; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
 ; SI-NEXT:    v_add_f32_e32 v16, 0x38000000, v16
-; SI-NEXT:    v_cvt_f16_f32_e32 v21, v21
 ; SI-NEXT:    v_add_f32_e32 v20, 0x38000000, v20
-; SI-NEXT:    v_cvt_f16_f32_e32 v25, v25
 ; SI-NEXT:    v_add_f32_e32 v24, 0x38000000, v24
-; SI-NEXT:    v_or_b32_e32 v14, v14, v28
-; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
@@ -13999,18 +14006,20 @@ define inreg <28 x i16> @bitcast_v28f16_to_v28i16_scalar(<28 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
 ; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
 ; SI-NEXT:    v_cvt_f16_f32_e32 v24, v24
-; SI-NEXT:    v_or_b32_e32 v10, v10, v28
-; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v7
-; SI-NEXT:    v_or_b32_e32 v6, v6, v28
-; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v3
-; SI-NEXT:    v_or_b32_e32 v2, v2, v28
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_lshr_b64 v[34:35], v[1:2], 16
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
 ; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v27
+; SI-NEXT:    v_lshr_b64 v[35:36], v[13:14], 16
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
+; SI-NEXT:    v_or_b32_e32 v26, v26, v28
 ; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; SI-NEXT:    v_lshr_b64 v[29:30], v[9:10], 16
+; SI-NEXT:    v_lshr_b64 v[36:37], v[21:22], 16
+; SI-NEXT:    v_lshr_b64 v[32:33], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[30:31], v[17:18], 16
+; SI-NEXT:    v_lshr_b64 v[37:38], v[25:26], 16
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SI-NEXT:    v_or_b32_e32 v4, v4, v5
 ; SI-NEXT:    v_or_b32_e32 v8, v8, v9
@@ -14018,14 +14027,14 @@ define inreg <28 x i16> @bitcast_v28f16_to_v28i16_scalar(<28 x half> inreg %a, i
 ; SI-NEXT:    v_or_b32_e32 v16, v16, v17
 ; SI-NEXT:    v_or_b32_e32 v20, v20, v21
 ; SI-NEXT:    v_or_b32_e32 v24, v24, v25
-; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v9, v10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v13, v14, v13, 16
-; SI-NEXT:    v_alignbit_b32 v17, v18, v17, 16
-; SI-NEXT:    v_alignbit_b32 v21, v22, v21, 16
-; SI-NEXT:    v_alignbit_b32 v25, v26, v25, 16
 ; SI-NEXT:  .LBB59_3: ; %end
+; SI-NEXT:    v_mov_b32_e32 v1, v34
+; SI-NEXT:    v_mov_b32_e32 v5, v32
+; SI-NEXT:    v_mov_b32_e32 v9, v29
+; SI-NEXT:    v_mov_b32_e32 v13, v35
+; SI-NEXT:    v_mov_b32_e32 v17, v30
+; SI-NEXT:    v_mov_b32_e32 v21, v36
+; SI-NEXT:    v_mov_b32_e32 v25, v37
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB59_4:
 ; SI-NEXT:    s_branch .LBB59_2
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll
index fd190b23dd8ca..acc02472c7161 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll
@@ -903,31 +903,32 @@ define inreg <3 x i16> @bitcast_v3bf16_to_v3i16_scalar(<3 x bfloat> inreg %a, i3
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_cmp_lg_u32 s19, 0
 ; SI-NEXT:    v_mul_f32_e64 v5, 1.0, s16
-; SI-NEXT:    v_mul_f32_e64 v4, 1.0, s17
-; SI-NEXT:    v_mul_f32_e64 v3, 1.0, s18
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s17
+; SI-NEXT:    v_mul_f32_e64 v4, 1.0, s18
 ; SI-NEXT:    s_cbranch_scc0 .LBB5_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
 ; SI-NEXT:    s_cbranch_execnz .LBB5_3
 ; SI-NEXT:  .LBB5_2: ; %cmp.true
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v5
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v4
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; SI-NEXT:    v_alignbit_b32 v0, v2, v0, 16
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT:    v_lshr_b64 v[3:4], v[1:2], 16
+; SI-NEXT:    v_alignbit_b32 v0, v5, v0, 16
 ; SI-NEXT:  .LBB5_3: ; %end
+; SI-NEXT:    v_mov_b32_e32 v1, v3
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB5_4:
 ; SI-NEXT:    ; implicit-def: $vgpr0
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    s_branch .LBB5_2
 ;
@@ -1391,26 +1392,27 @@ define inreg <3 x i16> @bitcast_v3f16_to_v3i16_scalar(<3 x half> inreg %a, i32 i
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, s16
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, s17
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, s18
 ; SI-NEXT:    s_cmp_lg_u32 s19, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB9_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_cbranch_execnz .LBB9_3
 ; SI-NEXT:  .LBB9_2: ; %cmp.true
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_lshr_b64 v[3:4], v[1:2], 16
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
-; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
 ; SI-NEXT:  .LBB9_3: ; %end
+; SI-NEXT:    v_mov_b32_e32 v1, v3
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB9_4:
 ; SI-NEXT:    s_branch .LBB9_2
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
index ee43eff01ab60..e33493c6a760e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
@@ -2143,96 +2143,113 @@ define inreg <32 x i16> @bitcast_v16i32_to_v32i16_scalar(<16 x i32> inreg %a, i3
 ; SI-LABEL: bitcast_v16i32_to_v32i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v3, v2
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; SI-NEXT:    v_mov_b32_e32 v30, v1
-; SI-NEXT:    v_mov_b32_e32 v28, v0
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; SI-NEXT:    v_readfirstlane_b32 s4, v0
+; SI-NEXT:    s_and_b64 s[6:7], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s5, v1
+; SI-NEXT:    s_cbranch_scc0 .LBB13_4
+; SI-NEXT:  ; %bb.1: ; %cmp.false
+; SI-NEXT:    s_lshr_b32 s56, s5, 16
+; SI-NEXT:    s_lshr_b32 s57, s29, 16
+; SI-NEXT:    s_lshr_b32 s58, s27, 16
+; SI-NEXT:    s_lshr_b32 s59, s25, 16
+; SI-NEXT:    s_lshr_b32 s60, s23, 16
+; SI-NEXT:    s_lshr_b32 s61, s21, 16
+; SI-NEXT:    s_lshr_b32 s62, s19, 16
+; SI-NEXT:    s_lshr_b32 s63, s17, 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[42:43], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[16:17], 16
+; SI-NEXT:    s_cbranch_execnz .LBB13_3
+; SI-NEXT:  .LBB13_2: ; %cmp.true
+; SI-NEXT:    s_add_i32 s17, s17, 3
+; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    s_add_i32 s19, s19, 3
+; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_add_i32 s21, s21, 3
+; SI-NEXT:    s_add_i32 s20, s20, 3
+; SI-NEXT:    s_add_i32 s23, s23, 3
+; SI-NEXT:    s_add_i32 s22, s22, 3
+; SI-NEXT:    s_add_i32 s25, s25, 3
+; SI-NEXT:    s_add_i32 s24, s24, 3
+; SI-NEXT:    s_add_i32 s27, s27, 3
+; SI-NEXT:    s_add_i32 s26, s26, 3
+; SI-NEXT:    s_add_i32 s29, s29, 3
+; SI-NEXT:    s_add_i32 s28, s28, 3
+; SI-NEXT:    s_add_i32 s5, s5, 3
+; SI-NEXT:    s_add_i32 s4, s4, 3
+; SI-NEXT:    s_lshr_b64 s[6:7], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[42:43], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[16:17], 16
+; SI-NEXT:    s_lshr_b32 s56, s5, 16
+; SI-NEXT:    s_lshr_b32 s57, s29, 16
+; SI-NEXT:    s_lshr_b32 s58, s27, 16
+; SI-NEXT:    s_lshr_b32 s59, s25, 16
+; SI-NEXT:    s_lshr_b32 s60, s23, 16
+; SI-NEXT:    s_lshr_b32 s61, s21, 16
+; SI-NEXT:    s_lshr_b32 s62, s19, 16
+; SI-NEXT:    s_lshr_b32 s63, s17, 16
+; SI-NEXT:  .LBB13_3: ; %end
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
+; SI-NEXT:    v_mov_b32_e32 v1, s44
 ; SI-NEXT:    v_mov_b32_e32 v2, s17
+; SI-NEXT:    v_mov_b32_e32 v3, s63
 ; SI-NEXT:    v_mov_b32_e32 v4, s18
+; SI-NEXT:    v_mov_b32_e32 v5, s42
 ; SI-NEXT:    v_mov_b32_e32 v6, s19
+; SI-NEXT:    v_mov_b32_e32 v7, s62
 ; SI-NEXT:    v_mov_b32_e32 v8, s20
+; SI-NEXT:    v_mov_b32_e32 v9, s40
 ; SI-NEXT:    v_mov_b32_e32 v10, s21
+; SI-NEXT:    v_mov_b32_e32 v11, s61
 ; SI-NEXT:    v_mov_b32_e32 v12, s22
+; SI-NEXT:    v_mov_b32_e32 v13, s14
 ; SI-NEXT:    v_mov_b32_e32 v14, s23
+; SI-NEXT:    v_mov_b32_e32 v15, s60
 ; SI-NEXT:    v_mov_b32_e32 v16, s24
+; SI-NEXT:    v_mov_b32_e32 v17, s12
 ; SI-NEXT:    v_mov_b32_e32 v18, s25
+; SI-NEXT:    v_mov_b32_e32 v19, s59
 ; SI-NEXT:    v_mov_b32_e32 v20, s26
+; SI-NEXT:    v_mov_b32_e32 v21, s10
 ; SI-NEXT:    v_mov_b32_e32 v22, s27
+; SI-NEXT:    v_mov_b32_e32 v23, s58
 ; SI-NEXT:    v_mov_b32_e32 v24, s28
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    v_mov_b32_e32 v25, s8
 ; SI-NEXT:    v_mov_b32_e32 v26, s29
-; SI-NEXT:    s_cbranch_scc0 .LBB13_4
-; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_alignbit_b32 v29, v30, v28, 16
-; SI-NEXT:    v_alignbit_b32 v25, v26, v24, 16
-; SI-NEXT:    v_alignbit_b32 v21, v22, v20, 16
-; SI-NEXT:    v_alignbit_b32 v17, v18, v16, 16
-; SI-NEXT:    v_alignbit_b32 v13, v14, v12, 16
-; SI-NEXT:    v_alignbit_b32 v9, v10, v8, 16
-; SI-NEXT:    v_alignbit_b32 v5, v6, v4, 16
-; SI-NEXT:    v_alignbit_b32 v1, v2, v0, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v30
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v26
-; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v22
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-NEXT:    s_cbranch_execnz .LBB13_3
-; SI-NEXT:  .LBB13_2: ; %cmp.true
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v2
-; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 3, v6
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 3, v4
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 3, v10
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 3, v8
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 3, v14
-; SI-NEXT:    v_add_i32_e32 v12, vcc, 3, v12
-; SI-NEXT:    v_add_i32_e32 v18, vcc, 3, v18
-; SI-NEXT:    v_add_i32_e32 v16, vcc, 3, v16
-; SI-NEXT:    v_add_i32_e32 v22, vcc, 3, v22
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 3, v20
-; SI-NEXT:    v_add_i32_e32 v26, vcc, 3, v26
-; SI-NEXT:    v_add_i32_e32 v24, vcc, 3, v24
-; SI-NEXT:    v_add_i32_e32 v30, vcc, 3, v30
-; SI-NEXT:    v_add_i32_e32 v28, vcc, 3, v28
-; SI-NEXT:    v_alignbit_b32 v29, v30, v28, 16
-; SI-NEXT:    v_alignbit_b32 v25, v26, v24, 16
-; SI-NEXT:    v_alignbit_b32 v21, v22, v20, 16
-; SI-NEXT:    v_alignbit_b32 v17, v18, v16, 16
-; SI-NEXT:    v_alignbit_b32 v13, v14, v12, 16
-; SI-NEXT:    v_alignbit_b32 v9, v10, v8, 16
-; SI-NEXT:    v_alignbit_b32 v5, v6, v4, 16
-; SI-NEXT:    v_alignbit_b32 v1, v2, v0, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v30
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v26
-; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v22
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-NEXT:  .LBB13_3: ; %end
+; SI-NEXT:    v_mov_b32_e32 v27, s57
+; SI-NEXT:    v_mov_b32_e32 v28, s4
+; SI-NEXT:    v_mov_b32_e32 v29, s6
+; SI-NEXT:    v_mov_b32_e32 v30, s5
+; SI-NEXT:    v_mov_b32_e32 v31, s56
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB13_4:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $vgpr7
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $vgpr15
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $vgpr29
-; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; implicit-def: $sgpr44
+; SI-NEXT:    ; implicit-def: $sgpr63
+; SI-NEXT:    ; implicit-def: $sgpr42
+; SI-NEXT:    ; implicit-def: $sgpr62
+; SI-NEXT:    ; implicit-def: $sgpr40
+; SI-NEXT:    ; implicit-def: $sgpr61
+; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr60
+; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr59
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr58
+; SI-NEXT:    ; implicit-def: $sgpr8
+; SI-NEXT:    ; implicit-def: $sgpr57
+; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr56
 ; SI-NEXT:    s_branch .LBB13_2
 ;
 ; VI-LABEL: bitcast_v16i32_to_v32i16_scalar:
@@ -9385,386 +9402,449 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32
 ; SI-LABEL: bitcast_v16i32_to_v64i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_writelane_b32 v4, s30, 0
+; SI-NEXT:    v_writelane_b32 v4, s31, 1
+; SI-NEXT:    v_writelane_b32 v4, s34, 2
+; SI-NEXT:    v_writelane_b32 v4, s35, 3
+; SI-NEXT:    v_writelane_b32 v4, s36, 4
+; SI-NEXT:    v_writelane_b32 v4, s37, 5
+; SI-NEXT:    v_writelane_b32 v4, s38, 6
+; SI-NEXT:    v_writelane_b32 v4, s39, 7
+; SI-NEXT:    v_writelane_b32 v4, s48, 8
+; SI-NEXT:    v_writelane_b32 v4, s49, 9
+; SI-NEXT:    v_writelane_b32 v4, s50, 10
+; SI-NEXT:    v_writelane_b32 v4, s51, 11
+; SI-NEXT:    v_writelane_b32 v4, s52, 12
+; SI-NEXT:    v_writelane_b32 v4, s53, 13
+; SI-NEXT:    v_writelane_b32 v4, s54, 14
+; SI-NEXT:    v_writelane_b32 v4, s55, 15
+; SI-NEXT:    v_writelane_b32 v4, s64, 16
+; SI-NEXT:    v_writelane_b32 v4, s65, 17
+; SI-NEXT:    v_writelane_b32 v4, s66, 18
+; SI-NEXT:    v_writelane_b32 v4, s67, 19
+; SI-NEXT:    v_writelane_b32 v4, s68, 20
+; SI-NEXT:    v_writelane_b32 v4, s69, 21
+; SI-NEXT:    v_writelane_b32 v4, s70, 22
+; SI-NEXT:    v_writelane_b32 v4, s71, 23
+; SI-NEXT:    v_writelane_b32 v4, s80, 24
+; SI-NEXT:    v_writelane_b32 v4, s81, 25
+; SI-NEXT:    v_writelane_b32 v4, s82, 26
+; SI-NEXT:    v_writelane_b32 v4, s83, 27
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; SI-NEXT:    v_readfirstlane_b32 s7, v1
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_readfirstlane_b32 s6, v2
+; SI-NEXT:    v_writelane_b32 v4, s84, 28
+; SI-NEXT:    v_readfirstlane_b32 s4, v1
+; SI-NEXT:    s_and_b64 s[6:7], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s5, v2
+; SI-NEXT:    v_writelane_b32 v4, s85, 29
 ; SI-NEXT:    s_cbranch_scc0 .LBB25_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v9, s26
-; SI-NEXT:    v_mov_b32_e32 v3, s7
-; SI-NEXT:    v_mov_b32_e32 v6, s28
-; SI-NEXT:    v_alignbit_b32 v7, s27, v9, 24
-; SI-NEXT:    v_alignbit_b32 v8, s27, v9, 16
-; SI-NEXT:    v_alignbit_b32 v10, s27, v9, 8
-; SI-NEXT:    v_mov_b32_e32 v9, s24
-; SI-NEXT:    v_mov_b32_e32 v14, s22
-; SI-NEXT:    v_mov_b32_e32 v18, s20
-; SI-NEXT:    v_mov_b32_e32 v21, s18
-; SI-NEXT:    v_mov_b32_e32 v22, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v3, 24
-; SI-NEXT:    v_alignbit_b32 v2, s6, v3, 16
-; SI-NEXT:    v_alignbit_b32 v3, s6, v3, 8
-; SI-NEXT:    v_alignbit_b32 v4, s29, v6, 24
-; SI-NEXT:    v_alignbit_b32 v5, s29, v6, 16
-; SI-NEXT:    v_alignbit_b32 v6, s29, v6, 8
-; SI-NEXT:    v_alignbit_b32 v13, s25, v9, 24
-; SI-NEXT:    v_alignbit_b32 v15, s25, v9, 16
-; SI-NEXT:    v_alignbit_b32 v9, s25, v9, 8
-; SI-NEXT:    v_alignbit_b32 v11, s23, v14, 24
-; SI-NEXT:    v_alignbit_b32 v12, s23, v14, 16
-; SI-NEXT:    v_alignbit_b32 v14, s23, v14, 8
-; SI-NEXT:    v_alignbit_b32 v16, s21, v18, 24
-; SI-NEXT:    v_alignbit_b32 v17, s21, v18, 16
-; SI-NEXT:    v_alignbit_b32 v18, s21, v18, 8
-; SI-NEXT:    v_alignbit_b32 v19, s19, v21, 24
-; SI-NEXT:    v_alignbit_b32 v20, s19, v21, 16
-; SI-NEXT:    v_alignbit_b32 v21, s19, v21, 8
-; SI-NEXT:    v_alignbit_b32 v23, s17, v22, 24
-; SI-NEXT:    v_alignbit_b32 v24, s17, v22, 16
-; SI-NEXT:    v_alignbit_b32 v22, s17, v22, 8
-; SI-NEXT:    s_lshr_b32 s8, s6, 24
-; SI-NEXT:    s_lshr_b32 s9, s6, 16
-; SI-NEXT:    s_lshr_b32 s10, s6, 8
-; SI-NEXT:    s_lshr_b32 s11, s29, 24
-; SI-NEXT:    s_lshr_b32 s12, s29, 16
-; SI-NEXT:    s_lshr_b32 s13, s29, 8
-; SI-NEXT:    s_lshr_b32 s14, s27, 24
-; SI-NEXT:    s_lshr_b32 s15, s27, 16
-; SI-NEXT:    s_lshr_b32 s40, s27, 8
-; SI-NEXT:    s_lshr_b32 s41, s25, 24
-; SI-NEXT:    s_lshr_b32 s42, s25, 16
-; SI-NEXT:    s_lshr_b32 s43, s25, 8
-; SI-NEXT:    s_lshr_b32 s44, s23, 24
-; SI-NEXT:    s_lshr_b32 s45, s23, 16
-; SI-NEXT:    s_lshr_b32 s46, s23, 8
-; SI-NEXT:    s_lshr_b32 s47, s21, 24
-; SI-NEXT:    s_lshr_b32 s56, s21, 16
-; SI-NEXT:    s_lshr_b32 s57, s21, 8
-; SI-NEXT:    s_lshr_b32 s58, s19, 24
-; SI-NEXT:    s_lshr_b32 s59, s19, 16
-; SI-NEXT:    s_lshr_b32 s60, s19, 8
-; SI-NEXT:    s_lshr_b32 s61, s17, 24
-; SI-NEXT:    s_lshr_b32 s62, s17, 16
-; SI-NEXT:    s_lshr_b32 s63, s17, 8
+; SI-NEXT:    s_lshr_b32 s38, s5, 24
+; SI-NEXT:    s_lshr_b32 s39, s5, 16
+; SI-NEXT:    s_lshr_b32 s48, s5, 8
+; SI-NEXT:    s_lshr_b32 s49, s29, 24
+; SI-NEXT:    s_lshr_b32 s50, s29, 16
+; SI-NEXT:    s_lshr_b32 s51, s29, 8
+; SI-NEXT:    s_lshr_b32 s52, s27, 24
+; SI-NEXT:    s_lshr_b32 s53, s27, 16
+; SI-NEXT:    s_lshr_b32 s54, s27, 8
+; SI-NEXT:    s_lshr_b32 s55, s25, 24
+; SI-NEXT:    s_lshr_b32 s64, s25, 16
+; SI-NEXT:    s_lshr_b32 s65, s25, 8
+; SI-NEXT:    s_lshr_b32 s66, s23, 24
+; SI-NEXT:    s_lshr_b32 s67, s23, 16
+; SI-NEXT:    s_lshr_b32 s68, s23, 8
+; SI-NEXT:    s_lshr_b32 s69, s21, 24
+; SI-NEXT:    s_lshr_b32 s70, s21, 16
+; SI-NEXT:    s_lshr_b32 s71, s21, 8
+; SI-NEXT:    s_lshr_b32 s80, s19, 24
+; SI-NEXT:    s_lshr_b32 s81, s19, 16
+; SI-NEXT:    s_lshr_b32 s82, s19, 8
+; SI-NEXT:    s_lshr_b32 s83, s17, 24
+; SI-NEXT:    s_lshr_b32 s84, s17, 16
+; SI-NEXT:    s_lshr_b32 s85, s17, 8
+; SI-NEXT:    s_lshr_b64 s[6:7], s[4:5], 24
+; SI-NEXT:    s_lshr_b64 s[8:9], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[4:5], 8
+; SI-NEXT:    s_lshr_b64 s[12:13], s[28:29], 24
+; SI-NEXT:    s_lshr_b64 s[14:15], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[28:29], 8
+; SI-NEXT:    s_lshr_b64 s[42:43], s[26:27], 24
+; SI-NEXT:    s_lshr_b64 s[44:45], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[26:27], 8
+; SI-NEXT:    s_lshr_b64 s[56:57], s[24:25], 24
+; SI-NEXT:    s_lshr_b64 s[58:59], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[24:25], 8
+; SI-NEXT:    s_lshr_b64 s[62:63], s[22:23], 24
+; SI-NEXT:    s_lshr_b64 s[72:73], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[88:89], s[22:23], 8
+; SI-NEXT:    s_lshr_b64 s[74:75], s[20:21], 24
+; SI-NEXT:    s_lshr_b64 s[76:77], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[78:79], s[20:21], 8
+; SI-NEXT:    s_lshr_b64 s[90:91], s[18:19], 24
+; SI-NEXT:    s_lshr_b64 s[92:93], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[94:95], s[18:19], 8
+; SI-NEXT:    s_lshr_b64 s[30:31], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[34:35], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[36:37], s[16:17], 8
 ; SI-NEXT:    s_cbranch_execnz .LBB25_3
 ; SI-NEXT:  .LBB25_2: ; %cmp.true
-; SI-NEXT:    s_add_i32 s26, s26, 3
+; SI-NEXT:    s_add_i32 s17, s17, 3
 ; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    s_add_i32 s19, s19, 3
 ; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_add_i32 s21, s21, 3
 ; SI-NEXT:    s_add_i32 s20, s20, 3
+; SI-NEXT:    s_add_i32 s23, s23, 3
 ; SI-NEXT:    s_add_i32 s22, s22, 3
+; SI-NEXT:    s_add_i32 s25, s25, 3
 ; SI-NEXT:    s_add_i32 s24, s24, 3
 ; SI-NEXT:    s_add_i32 s27, s27, 3
-; SI-NEXT:    s_add_i32 s28, s28, 3
-; SI-NEXT:    s_add_i32 s7, s7, 3
-; SI-NEXT:    v_mov_b32_e32 v9, s26
-; SI-NEXT:    s_add_i32 s17, s17, 3
-; SI-NEXT:    s_add_i32 s19, s19, 3
-; SI-NEXT:    s_add_i32 s21, s21, 3
-; SI-NEXT:    s_add_i32 s23, s23, 3
-; SI-NEXT:    s_add_i32 s25, s25, 3
+; SI-NEXT:    s_add_i32 s26, s26, 3
 ; SI-NEXT:    s_add_i32 s29, s29, 3
-; SI-NEXT:    s_add_i32 s6, s6, 3
-; SI-NEXT:    v_mov_b32_e32 v3, s7
-; SI-NEXT:    v_mov_b32_e32 v6, s28
-; SI-NEXT:    v_alignbit_b32 v7, s27, v9, 24
-; SI-NEXT:    v_alignbit_b32 v8, s27, v9, 16
-; SI-NEXT:    v_alignbit_b32 v10, s27, v9, 8
-; SI-NEXT:    v_mov_b32_e32 v9, s24
-; SI-NEXT:    v_mov_b32_e32 v14, s22
-; SI-NEXT:    v_mov_b32_e32 v18, s20
-; SI-NEXT:    v_mov_b32_e32 v21, s18
-; SI-NEXT:    v_mov_b32_e32 v22, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v3, 24
-; SI-NEXT:    v_alignbit_b32 v2, s6, v3, 16
-; SI-NEXT:    v_alignbit_b32 v3, s6, v3, 8
-; SI-NEXT:    v_alignbit_b32 v4, s29, v6, 24
-; SI-NEXT:    v_alignbit_b32 v5, s29, v6, 16
-; SI-NEXT:    v_alignbit_b32 v6, s29, v6, 8
-; SI-NEXT:    v_alignbit_b32 v13, s25, v9, 24
-; SI-NEXT:    v_alignbit_b32 v15, s25, v9, 16
-; SI-NEXT:    v_alignbit_b32 v9, s25, v9, 8
-; SI-NEXT:    v_alignbit_b32 v11, s23, v14, 24
-; SI-NEXT:    v_alignbit_b32 v12, s23, v14, 16
-; SI-NEXT:    v_alignbit_b32 v14, s23, v14, 8
-; SI-NEXT:    v_alignbit_b32 v16, s21, v18, 24
-; SI-NEXT:    v_alignbit_b32 v17, s21, v18, 16
-; SI-NEXT:    v_alignbit_b32 v18, s21, v18, 8
-; SI-NEXT:    v_alignbit_b32 v19, s19, v21, 24
-; SI-NEXT:    v_alignbit_b32 v20, s19, v21, 16
-; SI-NEXT:    v_alignbit_b32 v21, s19, v21, 8
-; SI-NEXT:    v_alignbit_b32 v23, s17, v22, 24
-; SI-NEXT:    v_alignbit_b32 v24, s17, v22, 16
-; SI-NEXT:    v_alignbit_b32 v22, s17, v22, 8
-; SI-NEXT:    s_lshr_b32 s8, s6, 24
-; SI-NEXT:    s_lshr_b32 s9, s6, 16
-; SI-NEXT:    s_lshr_b32 s10, s6, 8
-; SI-NEXT:    s_lshr_b32 s11, s29, 24
-; SI-NEXT:    s_lshr_b32 s12, s29, 16
-; SI-NEXT:    s_lshr_b32 s13, s29, 8
-; SI-NEXT:    s_lshr_b32 s14, s27, 24
-; SI-NEXT:    s_lshr_b32 s15, s27, 16
-; SI-NEXT:    s_lshr_b32 s40, s27, 8
-; SI-NEXT:    s_lshr_b32 s41, s25, 24
-; SI-NEXT:    s_lshr_b32 s42, s25, 16
-; SI-NEXT:    s_lshr_b32 s43, s25, 8
-; SI-NEXT:    s_lshr_b32 s44, s23, 24
-; SI-NEXT:    s_lshr_b32 s45, s23, 16
-; SI-NEXT:    s_lshr_b32 s46, s23, 8
-; SI-NEXT:    s_lshr_b32 s47, s21, 24
-; SI-NEXT:    s_lshr_b32 s56, s21, 16
-; SI-NEXT:    s_lshr_b32 s57, s21, 8
-; SI-NEXT:    s_lshr_b32 s58, s19, 24
-; SI-NEXT:    s_lshr_b32 s59, s19, 16
-; SI-NEXT:    s_lshr_b32 s60, s19, 8
-; SI-NEXT:    s_lshr_b32 s61, s17, 24
-; SI-NEXT:    s_lshr_b32 s62, s17, 16
-; SI-NEXT:    s_lshr_b32 s63, s17, 8
+; SI-NEXT:    s_add_i32 s28, s28, 3
+; SI-NEXT:    s_add_i32 s5, s5, 3
+; SI-NEXT:    s_add_i32 s4, s4, 3
+; SI-NEXT:    s_lshr_b64 s[6:7], s[4:5], 24
+; SI-NEXT:    s_lshr_b64 s[8:9], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[4:5], 8
+; SI-NEXT:    s_lshr_b64 s[12:13], s[28:29], 24
+; SI-NEXT:    s_lshr_b64 s[14:15], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[28:29], 8
+; SI-NEXT:    s_lshr_b64 s[42:43], s[26:27], 24
+; SI-NEXT:    s_lshr_b64 s[44:45], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[26:27], 8
+; SI-NEXT:    s_lshr_b64 s[56:57], s[24:25], 24
+; SI-NEXT:    s_lshr_b64 s[58:59], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[24:25], 8
+; SI-NEXT:    s_lshr_b64 s[62:63], s[22:23], 24
+; SI-NEXT:    s_lshr_b64 s[72:73], s[22:23], 16
+; SI-NEXT:    s_lshr_b32 s38, s5, 24
+; SI-NEXT:    s_lshr_b32 s39, s5, 16
+; SI-NEXT:    s_lshr_b32 s48, s5, 8
+; SI-NEXT:    s_lshr_b32 s49, s29, 24
+; SI-NEXT:    s_lshr_b32 s50, s29, 16
+; SI-NEXT:    s_lshr_b32 s51, s29, 8
+; SI-NEXT:    s_lshr_b32 s52, s27, 24
+; SI-NEXT:    s_lshr_b32 s53, s27, 16
+; SI-NEXT:    s_lshr_b32 s54, s27, 8
+; SI-NEXT:    s_lshr_b32 s55, s25, 24
+; SI-NEXT:    s_lshr_b32 s64, s25, 16
+; SI-NEXT:    s_lshr_b32 s65, s25, 8
+; SI-NEXT:    s_lshr_b32 s66, s23, 24
+; SI-NEXT:    s_lshr_b32 s67, s23, 16
+; SI-NEXT:    s_lshr_b32 s68, s23, 8
+; SI-NEXT:    s_lshr_b32 s69, s21, 24
+; SI-NEXT:    s_lshr_b32 s70, s21, 16
+; SI-NEXT:    s_lshr_b32 s71, s21, 8
+; SI-NEXT:    s_lshr_b32 s80, s19, 24
+; SI-NEXT:    s_lshr_b32 s81, s19, 16
+; SI-NEXT:    s_lshr_b32 s82, s19, 8
+; SI-NEXT:    s_lshr_b32 s83, s17, 24
+; SI-NEXT:    s_lshr_b32 s84, s17, 16
+; SI-NEXT:    s_lshr_b32 s85, s17, 8
+; SI-NEXT:    s_lshr_b64 s[88:89], s[22:23], 8
+; SI-NEXT:    s_lshr_b64 s[74:75], s[20:21], 24
+; SI-NEXT:    s_lshr_b64 s[76:77], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[78:79], s[20:21], 8
+; SI-NEXT:    s_lshr_b64 s[90:91], s[18:19], 24
+; SI-NEXT:    s_lshr_b64 s[92:93], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[94:95], s[18:19], 8
+; SI-NEXT:    s_lshr_b64 s[30:31], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[34:35], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[36:37], s[16:17], 8
 ; SI-NEXT:  .LBB25_3: ; %end
-; SI-NEXT:    s_and_b32 s4, s16, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 8, v22
-; SI-NEXT:    v_or_b32_e32 v22, s4, v22
-; SI-NEXT:    s_and_b32 s4, s17, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s63, 8
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s62, 0xff
-; SI-NEXT:    v_and_b32_e32 v24, 0xff, v24
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s61, 24
-; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v23, 24, v23
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
-; SI-NEXT:    v_or_b32_e32 v23, v23, v24
-; SI-NEXT:    v_and_b32_e32 v22, 0xffff, v22
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_or_b32_e32 v22, v22, v23
-; SI-NEXT:    v_mov_b32_e32 v23, s4
-; SI-NEXT:    s_and_b32 s4, s18, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 8, v21
-; SI-NEXT:    v_or_b32_e32 v21, s4, v21
-; SI-NEXT:    s_and_b32 s4, s19, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s60, 8
-; SI-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s59, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 24, v19
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s58, 24
-; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v21
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
-; SI-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v22, vcc, 4, v0
-; SI-NEXT:    v_or_b32_e32 v19, v21, v19
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 8, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v23, v22, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v20, s4
-; SI-NEXT:    s_and_b32 s4, s20, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 8, v18
-; SI-NEXT:    v_or_b32_e32 v18, s4, v18
-; SI-NEXT:    s_and_b32 s4, s21, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s57, 8
-; SI-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s56, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 24, v16
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s47, 24
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v18
-; SI-NEXT:    v_or_b32_e32 v16, v16, v17
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v19, vcc, 12, v0
-; SI-NEXT:    v_or_b32_e32 v16, v18, v16
-; SI-NEXT:    v_add_i32_e32 v17, vcc, 16, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v20, v19, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v16, v17, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v17, s4
-; SI-NEXT:    s_and_b32 s4, s22, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
-; SI-NEXT:    v_or_b32_e32 v14, s4, v14
-; SI-NEXT:    s_and_b32 s4, s23, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s46, 8
-; SI-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s45, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s44, 24
-; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; SI-NEXT:    v_or_b32_e32 v11, v11, v12
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
+; SI-NEXT:    s_lshl_b32 s7, s36, 8
+; SI-NEXT:    s_and_b32 s9, s16, 0xff
+; SI-NEXT:    s_or_b32 s7, s9, s7
+; SI-NEXT:    s_and_b32 s9, s34, 0xff
+; SI-NEXT:    s_lshl_b32 s11, s30, 24
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_or_b32 s9, s11, s9
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    v_mov_b32_e32 v1, s7
+; SI-NEXT:    s_and_b32 s7, s17, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s85, 8
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    s_and_b32 s9, s84, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_lshl_b32 s11, s83, 24
+; SI-NEXT:    s_or_b32 s9, s11, s9
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    s_lshl_b32 s7, s94, 8
+; SI-NEXT:    s_and_b32 s9, s18, 0xff
+; SI-NEXT:    s_or_b32 s7, s9, s7
+; SI-NEXT:    s_and_b32 s9, s92, 0xff
+; SI-NEXT:    s_lshl_b32 s11, s90, 24
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_or_b32 s9, s11, s9
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    v_mov_b32_e32 v3, s7
+; SI-NEXT:    s_and_b32 s7, s19, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s82, 8
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    s_and_b32 s9, s81, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_lshl_b32 s11, s80, 24
+; SI-NEXT:    s_or_b32 s9, s11, s9
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v16, vcc, 20, v0
-; SI-NEXT:    v_or_b32_e32 v11, v14, v11
-; SI-NEXT:    v_add_i32_e32 v12, vcc, 24, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v17, v16, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v11, v12, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v12, s4
-; SI-NEXT:    s_and_b32 s4, s24, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 28, v0
-; SI-NEXT:    v_or_b32_e32 v9, s4, v9
-; SI-NEXT:    s_and_b32 s4, s25, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s43, 8
-; SI-NEXT:    buffer_store_dword v12, v11, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v11, 0xff, v15
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s42, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    s_and_b32 s7, s20, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s78, 8
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    s_and_b32 s9, s76, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_lshl_b32 s11, s74, 24
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s9, s11, s9
+; SI-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 12, v0
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    s_and_b32 s7, s21, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s71, 8
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    s_and_b32 s9, s70, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_lshl_b32 s11, s69, 24
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s9, s11, s9
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 16, v0
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 24, v13
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s41, 24
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; SI-NEXT:    v_or_b32_e32 v11, v12, v11
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
-; SI-NEXT:    v_or_b32_e32 v9, v9, v11
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 32, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v9, v11, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    s_and_b32 s7, s22, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s88, 8
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    s_and_b32 s9, s72, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_lshl_b32 s11, s62, 24
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s9, s11, s9
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 20, v0
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    s_and_b32 s7, s23, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s68, 8
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    s_and_b32 s9, s67, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_lshl_b32 s11, s66, 24
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s9, s11, s9
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 24, v0
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    s_and_b32 s7, s24, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s60, 8
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    s_and_b32 s9, s58, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_lshl_b32 s11, s56, 24
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s9, s11, s9
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 28, v0
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    s_and_b32 s7, s25, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s65, 8
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    s_and_b32 s9, s64, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_lshl_b32 s11, s55, 24
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s9, s11, s9
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 32, v0
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 36, v0
-; SI-NEXT:    v_mov_b32_e32 v11, s4
-; SI-NEXT:    buffer_store_dword v11, v9, s[0:3], 0 offen
-; SI-NEXT:    s_and_b32 s4, s26, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 8, v10
-; SI-NEXT:    v_or_b32_e32 v9, s4, v9
-; SI-NEXT:    s_and_b32 s4, s27, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s40, 8
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s15, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s14, s14, 24
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; SI-NEXT:    v_or_b32_e32 v7, v7, v8
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s14, s5
-; SI-NEXT:    v_or_b32_e32 v7, v9, v7
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 40, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v8, s4
-; SI-NEXT:    s_and_b32 s4, s28, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
-; SI-NEXT:    v_or_b32_e32 v6, s4, v6
-; SI-NEXT:    s_and_b32 s4, s29, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s13, 8
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s12, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s11, s11, 24
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; SI-NEXT:    v_or_b32_e32 v4, v4, v5
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    s_and_b32 s7, s26, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s46, 8
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    s_and_b32 s9, s44, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_lshl_b32 s11, s42, 24
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s9, s11, s9
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 36, v0
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    s_and_b32 s7, s27, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s54, 8
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    s_and_b32 s9, s53, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_lshl_b32 s11, s52, 24
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s9, s11, s9
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 40, v0
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    s_and_b32 s7, s28, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s40, 8
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    s_and_b32 s9, s14, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_lshl_b32 s11, s12, 24
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s9, s11, s9
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 44, v0
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    s_and_b32 s7, s29, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s51, 8
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    s_and_b32 s9, s50, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_lshl_b32 s11, s49, 24
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s9, s11, s9
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 48, v0
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    s_and_b32 s4, s4, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s10, 8
+; SI-NEXT:    s_or_b32 s4, s4, s7
+; SI-NEXT:    s_and_b32 s7, s8, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    s_lshl_b32 s6, s6, 24
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s11, s5
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 52, v0
+; SI-NEXT:    s_or_b32 s4, s4, s6
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 44, v0
-; SI-NEXT:    v_or_b32_e32 v4, v6, v4
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v8, v7, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v5, s4
-; SI-NEXT:    s_and_b32 s4, s7, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; SI-NEXT:    v_or_b32_e32 v3, s4, v3
-; SI-NEXT:    s_and_b32 s4, s6, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s10, 8
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT:    v_mov_b32_e32 v2, s4
+; SI-NEXT:    s_and_b32 s4, s5, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s48, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s9, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; SI-NEXT:    s_and_b32 s5, s39, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s6, s8, 24
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    s_lshl_b32 s6, s38, 24
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s5, s6, s5
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 52, v0
-; SI-NEXT:    v_or_b32_e32 v1, v3, v1
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 56, v0
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 56, v0
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 60, v0
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT:    v_readlane_b32 s85, v4, 29
+; SI-NEXT:    v_readlane_b32 s84, v4, 28
+; SI-NEXT:    v_readlane_b32 s83, v4, 27
+; SI-NEXT:    v_readlane_b32 s82, v4, 26
+; SI-NEXT:    v_readlane_b32 s81, v4, 25
+; SI-NEXT:    v_readlane_b32 s80, v4, 24
+; SI-NEXT:    v_readlane_b32 s71, v4, 23
+; SI-NEXT:    v_readlane_b32 s70, v4, 22
+; SI-NEXT:    v_readlane_b32 s69, v4, 21
+; SI-NEXT:    v_readlane_b32 s68, v4, 20
+; SI-NEXT:    v_readlane_b32 s67, v4, 19
+; SI-NEXT:    v_readlane_b32 s66, v4, 18
+; SI-NEXT:    v_readlane_b32 s65, v4, 17
+; SI-NEXT:    v_readlane_b32 s64, v4, 16
+; SI-NEXT:    v_readlane_b32 s55, v4, 15
+; SI-NEXT:    v_readlane_b32 s54, v4, 14
+; SI-NEXT:    v_readlane_b32 s53, v4, 13
+; SI-NEXT:    v_readlane_b32 s52, v4, 12
+; SI-NEXT:    v_readlane_b32 s51, v4, 11
+; SI-NEXT:    v_readlane_b32 s50, v4, 10
+; SI-NEXT:    v_readlane_b32 s49, v4, 9
+; SI-NEXT:    v_readlane_b32 s48, v4, 8
+; SI-NEXT:    v_readlane_b32 s39, v4, 7
+; SI-NEXT:    v_readlane_b32 s38, v4, 6
+; SI-NEXT:    v_readlane_b32 s37, v4, 5
+; SI-NEXT:    v_readlane_b32 s36, v4, 4
+; SI-NEXT:    v_readlane_b32 s35, v4, 3
+; SI-NEXT:    v_readlane_b32 s34, v4, 2
+; SI-NEXT:    v_readlane_b32 s31, v4, 1
+; SI-NEXT:    v_readlane_b32 s30, v4, 0
+; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB25_4:
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr24
-; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; implicit-def: $sgpr63
+; SI-NEXT:    ; implicit-def: $sgpr36
+; SI-NEXT:    ; implicit-def: $sgpr34
+; SI-NEXT:    ; implicit-def: $sgpr30
+; SI-NEXT:    ; implicit-def: $sgpr85
+; SI-NEXT:    ; implicit-def: $sgpr84
+; SI-NEXT:    ; implicit-def: $sgpr83
+; SI-NEXT:    ; implicit-def: $sgpr94
+; SI-NEXT:    ; implicit-def: $sgpr92
+; SI-NEXT:    ; implicit-def: $sgpr90
+; SI-NEXT:    ; implicit-def: $sgpr82
+; SI-NEXT:    ; implicit-def: $sgpr81
+; SI-NEXT:    ; implicit-def: $sgpr80
+; SI-NEXT:    ; implicit-def: $sgpr78
+; SI-NEXT:    ; implicit-def: $sgpr76
+; SI-NEXT:    ; implicit-def: $sgpr74
+; SI-NEXT:    ; implicit-def: $sgpr71
+; SI-NEXT:    ; implicit-def: $sgpr70
+; SI-NEXT:    ; implicit-def: $sgpr69
+; SI-NEXT:    ; implicit-def: $sgpr68
+; SI-NEXT:    ; implicit-def: $sgpr67
+; SI-NEXT:    ; implicit-def: $sgpr66
+; SI-NEXT:    ; implicit-def: $sgpr65
+; SI-NEXT:    ; implicit-def: $sgpr64
+; SI-NEXT:    ; implicit-def: $sgpr55
+; SI-NEXT:    ; implicit-def: $sgpr54
+; SI-NEXT:    ; implicit-def: $sgpr53
+; SI-NEXT:    ; implicit-def: $sgpr52
+; SI-NEXT:    ; implicit-def: $sgpr51
+; SI-NEXT:    ; implicit-def: $sgpr50
+; SI-NEXT:    ; implicit-def: $sgpr49
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr39
+; SI-NEXT:    ; implicit-def: $sgpr38
+; SI-NEXT:    ; implicit-def: $sgpr88
+; SI-NEXT:    ; implicit-def: $sgpr72
 ; SI-NEXT:    ; implicit-def: $sgpr62
-; SI-NEXT:    ; implicit-def: $sgpr61
-; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr20
-; SI-NEXT:    ; implicit-def: $vgpr19
 ; SI-NEXT:    ; implicit-def: $sgpr60
-; SI-NEXT:    ; implicit-def: $sgpr59
 ; SI-NEXT:    ; implicit-def: $sgpr58
-; SI-NEXT:    ; implicit-def: $vgpr18
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr16
-; SI-NEXT:    ; implicit-def: $sgpr57
 ; SI-NEXT:    ; implicit-def: $sgpr56
-; SI-NEXT:    ; implicit-def: $sgpr47
-; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; implicit-def: $vgpr11
 ; SI-NEXT:    ; implicit-def: $sgpr46
-; SI-NEXT:    ; implicit-def: $sgpr45
 ; SI-NEXT:    ; implicit-def: $sgpr44
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $sgpr43
 ; SI-NEXT:    ; implicit-def: $sgpr42
-; SI-NEXT:    ; implicit-def: $sgpr41
 ; SI-NEXT:    ; implicit-def: $sgpr40
-; SI-NEXT:    ; implicit-def: $sgpr15
 ; SI-NEXT:    ; implicit-def: $sgpr14
-; SI-NEXT:    ; implicit-def: $sgpr13
 ; SI-NEXT:    ; implicit-def: $sgpr12
-; SI-NEXT:    ; implicit-def: $sgpr11
 ; SI-NEXT:    ; implicit-def: $sgpr10
-; SI-NEXT:    ; implicit-def: $sgpr9
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $vgpr15
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr7
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $sgpr6
 ; SI-NEXT:    s_branch .LBB25_2
 ;
 ; VI-LABEL: bitcast_v16i32_to_v64i8_scalar:
@@ -16750,78 +16830,93 @@ define inreg <32 x i16> @bitcast_v16f32_to_v32i16_scalar(<16 x float> inreg %a,
 ; SI-LABEL: bitcast_v16f32_to_v32i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v3, v2
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; SI-NEXT:    v_mov_b32_e32 v30, v1
-; SI-NEXT:    v_mov_b32_e32 v28, v0
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_mov_b32_e32 v2, s17
-; SI-NEXT:    v_mov_b32_e32 v4, s18
-; SI-NEXT:    v_mov_b32_e32 v6, s19
-; SI-NEXT:    v_mov_b32_e32 v8, s20
-; SI-NEXT:    v_mov_b32_e32 v10, s21
-; SI-NEXT:    v_mov_b32_e32 v12, s22
-; SI-NEXT:    v_mov_b32_e32 v14, s23
-; SI-NEXT:    v_mov_b32_e32 v16, s24
-; SI-NEXT:    v_mov_b32_e32 v18, s25
-; SI-NEXT:    v_mov_b32_e32 v20, s26
-; SI-NEXT:    v_mov_b32_e32 v22, s27
-; SI-NEXT:    v_mov_b32_e32 v24, s28
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; SI-NEXT:    v_mov_b32_e32 v33, v1
+; SI-NEXT:    v_mov_b32_e32 v32, v0
+; SI-NEXT:    v_mov_b32_e32 v34, s16
+; SI-NEXT:    v_mov_b32_e32 v35, s17
+; SI-NEXT:    v_mov_b32_e32 v36, s18
+; SI-NEXT:    v_mov_b32_e32 v37, s19
+; SI-NEXT:    v_mov_b32_e32 v38, s20
+; SI-NEXT:    v_mov_b32_e32 v39, s21
+; SI-NEXT:    v_mov_b32_e32 v48, s22
+; SI-NEXT:    v_mov_b32_e32 v49, s23
+; SI-NEXT:    v_mov_b32_e32 v50, s24
+; SI-NEXT:    v_mov_b32_e32 v51, s25
+; SI-NEXT:    v_mov_b32_e32 v52, s26
+; SI-NEXT:    v_mov_b32_e32 v53, s27
+; SI-NEXT:    v_mov_b32_e32 v54, s28
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_mov_b32_e32 v26, s29
+; SI-NEXT:    v_mov_b32_e32 v55, s29
 ; SI-NEXT:    s_cbranch_scc0 .LBB37_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_alignbit_b32 v29, v30, v28, 16
-; SI-NEXT:    v_alignbit_b32 v25, v26, v24, 16
-; SI-NEXT:    v_alignbit_b32 v21, v22, v20, 16
-; SI-NEXT:    v_alignbit_b32 v17, v18, v16, 16
-; SI-NEXT:    v_alignbit_b32 v13, v14, v12, 16
-; SI-NEXT:    v_alignbit_b32 v9, v10, v8, 16
-; SI-NEXT:    v_alignbit_b32 v5, v6, v4, 16
-; SI-NEXT:    v_alignbit_b32 v1, v2, v0, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v30
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v26
-; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v22
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v33
+; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v55
+; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v53
+; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v51
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v49
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v39
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v37
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v35
+; SI-NEXT:    v_lshr_b64 v[29:30], v[32:33], 16
+; SI-NEXT:    v_lshr_b64 v[25:26], v[54:55], 16
+; SI-NEXT:    v_lshr_b64 v[21:22], v[52:53], 16
+; SI-NEXT:    v_lshr_b64 v[17:18], v[50:51], 16
+; SI-NEXT:    v_lshr_b64 v[13:14], v[48:49], 16
+; SI-NEXT:    v_lshr_b64 v[9:10], v[38:39], 16
+; SI-NEXT:    v_lshr_b64 v[5:6], v[36:37], 16
+; SI-NEXT:    v_lshr_b64 v[1:2], v[34:35], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB37_3
 ; SI-NEXT:  .LBB37_2: ; %cmp.true
-; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
-; SI-NEXT:    v_add_f32_e32 v6, 1.0, v6
-; SI-NEXT:    v_add_f32_e32 v4, 1.0, v4
-; SI-NEXT:    v_add_f32_e32 v10, 1.0, v10
-; SI-NEXT:    v_add_f32_e32 v8, 1.0, v8
-; SI-NEXT:    v_add_f32_e32 v14, 1.0, v14
-; SI-NEXT:    v_add_f32_e32 v12, 1.0, v12
-; SI-NEXT:    v_add_f32_e32 v18, 1.0, v18
-; SI-NEXT:    v_add_f32_e32 v16, 1.0, v16
-; SI-NEXT:    v_add_f32_e32 v22, 1.0, v22
-; SI-NEXT:    v_add_f32_e32 v20, 1.0, v20
-; SI-NEXT:    v_add_f32_e32 v26, 1.0, v26
-; SI-NEXT:    v_add_f32_e32 v24, 1.0, v24
-; SI-NEXT:    v_add_f32_e32 v30, 1.0, v30
-; SI-NEXT:    v_add_f32_e32 v28, 1.0, v28
-; SI-NEXT:    v_alignbit_b32 v29, v30, v28, 16
-; SI-NEXT:    v_alignbit_b32 v25, v26, v24, 16
-; SI-NEXT:    v_alignbit_b32 v21, v22, v20, 16
-; SI-NEXT:    v_alignbit_b32 v17, v18, v16, 16
-; SI-NEXT:    v_alignbit_b32 v13, v14, v12, 16
-; SI-NEXT:    v_alignbit_b32 v9, v10, v8, 16
-; SI-NEXT:    v_alignbit_b32 v5, v6, v4, 16
-; SI-NEXT:    v_alignbit_b32 v1, v2, v0, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v30
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v26
-; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v22
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; SI-NEXT:    v_add_f32_e32 v35, 1.0, v35
+; SI-NEXT:    v_add_f32_e32 v34, 1.0, v34
+; SI-NEXT:    v_add_f32_e32 v37, 1.0, v37
+; SI-NEXT:    v_add_f32_e32 v36, 1.0, v36
+; SI-NEXT:    v_add_f32_e32 v39, 1.0, v39
+; SI-NEXT:    v_add_f32_e32 v38, 1.0, v38
+; SI-NEXT:    v_add_f32_e32 v49, 1.0, v49
+; SI-NEXT:    v_add_f32_e32 v48, 1.0, v48
+; SI-NEXT:    v_add_f32_e32 v51, 1.0, v51
+; SI-NEXT:    v_add_f32_e32 v50, 1.0, v50
+; SI-NEXT:    v_add_f32_e32 v53, 1.0, v53
+; SI-NEXT:    v_add_f32_e32 v52, 1.0, v52
+; SI-NEXT:    v_add_f32_e32 v55, 1.0, v55
+; SI-NEXT:    v_add_f32_e32 v54, 1.0, v54
+; SI-NEXT:    v_add_f32_e32 v33, 1.0, v33
+; SI-NEXT:    v_add_f32_e32 v32, 1.0, v32
+; SI-NEXT:    v_lshr_b64 v[29:30], v[32:33], 16
+; SI-NEXT:    v_lshr_b64 v[25:26], v[54:55], 16
+; SI-NEXT:    v_lshr_b64 v[21:22], v[52:53], 16
+; SI-NEXT:    v_lshr_b64 v[17:18], v[50:51], 16
+; SI-NEXT:    v_lshr_b64 v[13:14], v[48:49], 16
+; SI-NEXT:    v_lshr_b64 v[9:10], v[38:39], 16
+; SI-NEXT:    v_lshr_b64 v[5:6], v[36:37], 16
+; SI-NEXT:    v_lshr_b64 v[1:2], v[34:35], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v33
+; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v55
+; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v53
+; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v51
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v49
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v39
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v37
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v35
 ; SI-NEXT:  .LBB37_3: ; %end
+; SI-NEXT:    v_mov_b32_e32 v0, v34
+; SI-NEXT:    v_mov_b32_e32 v2, v35
+; SI-NEXT:    v_mov_b32_e32 v4, v36
+; SI-NEXT:    v_mov_b32_e32 v6, v37
+; SI-NEXT:    v_mov_b32_e32 v8, v38
+; SI-NEXT:    v_mov_b32_e32 v10, v39
+; SI-NEXT:    v_mov_b32_e32 v12, v48
+; SI-NEXT:    v_mov_b32_e32 v14, v49
+; SI-NEXT:    v_mov_b32_e32 v16, v50
+; SI-NEXT:    v_mov_b32_e32 v18, v51
+; SI-NEXT:    v_mov_b32_e32 v20, v52
+; SI-NEXT:    v_mov_b32_e32 v22, v53
+; SI-NEXT:    v_mov_b32_e32 v24, v54
+; SI-NEXT:    v_mov_b32_e32 v26, v55
+; SI-NEXT:    v_mov_b32_e32 v28, v32
+; SI-NEXT:    v_mov_b32_e32 v30, v33
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB37_4:
 ; SI-NEXT:    ; implicit-def: $vgpr1
@@ -16836,10 +16931,10 @@ define inreg <32 x i16> @bitcast_v16f32_to_v32i16_scalar(<16 x float> inreg %a,
 ; SI-NEXT:    ; implicit-def: $vgpr19
 ; SI-NEXT:    ; implicit-def: $vgpr21
 ; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; implicit-def: $vgpr25
 ; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $vgpr29
 ; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr25
+; SI-NEXT:    ; implicit-def: $vgpr29
 ; SI-NEXT:    s_branch .LBB37_2
 ;
 ; VI-LABEL: bitcast_v16f32_to_v32i16_scalar:
@@ -23954,433 +24049,494 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3
 ; SI-LABEL: bitcast_v16f32_to_v64i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_writelane_b32 v40, s30, 0
+; SI-NEXT:    v_writelane_b32 v40, s31, 1
+; SI-NEXT:    v_writelane_b32 v40, s34, 2
+; SI-NEXT:    v_writelane_b32 v40, s35, 3
+; SI-NEXT:    v_writelane_b32 v40, s36, 4
+; SI-NEXT:    v_writelane_b32 v40, s37, 5
+; SI-NEXT:    v_writelane_b32 v40, s38, 6
+; SI-NEXT:    v_writelane_b32 v40, s39, 7
+; SI-NEXT:    v_writelane_b32 v40, s48, 8
+; SI-NEXT:    v_writelane_b32 v40, s49, 9
+; SI-NEXT:    v_writelane_b32 v40, s50, 10
+; SI-NEXT:    v_writelane_b32 v40, s51, 11
+; SI-NEXT:    v_writelane_b32 v40, s52, 12
+; SI-NEXT:    v_writelane_b32 v40, s53, 13
+; SI-NEXT:    v_writelane_b32 v40, s54, 14
+; SI-NEXT:    v_writelane_b32 v40, s55, 15
+; SI-NEXT:    v_writelane_b32 v40, s64, 16
+; SI-NEXT:    v_writelane_b32 v40, s65, 17
+; SI-NEXT:    v_writelane_b32 v40, s66, 18
+; SI-NEXT:    v_writelane_b32 v40, s67, 19
+; SI-NEXT:    v_writelane_b32 v40, s68, 20
+; SI-NEXT:    v_writelane_b32 v40, s69, 21
+; SI-NEXT:    v_writelane_b32 v40, s70, 22
+; SI-NEXT:    v_writelane_b32 v40, s71, 23
+; SI-NEXT:    v_writelane_b32 v40, s80, 24
+; SI-NEXT:    v_writelane_b32 v40, s81, 25
+; SI-NEXT:    v_writelane_b32 v40, s82, 26
+; SI-NEXT:    v_writelane_b32 v40, s83, 27
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; SI-NEXT:    v_mov_b32_e32 v28, s16
-; SI-NEXT:    v_mov_b32_e32 v25, s17
-; SI-NEXT:    v_mov_b32_e32 v20, s18
-; SI-NEXT:    v_mov_b32_e32 v19, s19
+; SI-NEXT:    v_writelane_b32 v40, s84, 28
+; SI-NEXT:    v_readfirstlane_b32 s36, v1
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s37, v2
+; SI-NEXT:    v_writelane_b32 v40, s85, 29
+; SI-NEXT:    s_cbranch_scc0 .LBB49_3
+; SI-NEXT:  ; %bb.1: ; %cmp.false
+; SI-NEXT:    s_lshr_b32 s82, s37, 24
+; SI-NEXT:    s_lshr_b32 s84, s37, 16
+; SI-NEXT:    s_lshr_b32 s85, s37, 8
+; SI-NEXT:    s_lshr_b32 s71, s29, 24
+; SI-NEXT:    s_lshr_b32 s81, s29, 16
+; SI-NEXT:    s_lshr_b32 s83, s29, 8
+; SI-NEXT:    s_lshr_b32 s68, s27, 24
+; SI-NEXT:    s_lshr_b32 s70, s27, 16
+; SI-NEXT:    s_lshr_b32 s80, s27, 8
+; SI-NEXT:    s_lshr_b32 s65, s25, 24
+; SI-NEXT:    s_lshr_b32 s67, s25, 16
+; SI-NEXT:    s_lshr_b32 s69, s25, 8
+; SI-NEXT:    s_lshr_b32 s54, s23, 24
+; SI-NEXT:    s_lshr_b32 s64, s23, 16
+; SI-NEXT:    s_lshr_b32 s66, s23, 8
+; SI-NEXT:    s_lshr_b32 s51, s21, 24
+; SI-NEXT:    s_lshr_b32 s53, s21, 16
+; SI-NEXT:    s_lshr_b32 s55, s21, 8
+; SI-NEXT:    s_lshr_b32 s48, s19, 24
+; SI-NEXT:    s_lshr_b32 s50, s19, 16
+; SI-NEXT:    s_lshr_b32 s52, s19, 8
+; SI-NEXT:    s_lshr_b32 s38, s17, 24
+; SI-NEXT:    s_lshr_b32 s39, s17, 16
+; SI-NEXT:    s_lshr_b32 s49, s17, 8
+; SI-NEXT:    s_lshr_b64 s[4:5], s[36:37], 24
+; SI-NEXT:    s_lshr_b64 s[6:7], s[36:37], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[36:37], 8
+; SI-NEXT:    s_lshr_b64 s[10:11], s[28:29], 24
+; SI-NEXT:    s_lshr_b64 s[12:13], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[28:29], 8
+; SI-NEXT:    s_lshr_b64 s[40:41], s[26:27], 24
+; SI-NEXT:    s_lshr_b64 s[42:43], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[26:27], 8
+; SI-NEXT:    s_lshr_b64 s[46:47], s[24:25], 24
+; SI-NEXT:    s_lshr_b64 s[56:57], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[58:59], s[24:25], 8
+; SI-NEXT:    s_lshr_b64 s[72:73], s[22:23], 24
+; SI-NEXT:    s_lshr_b64 s[76:77], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[78:79], s[22:23], 8
+; SI-NEXT:    s_lshr_b64 s[60:61], s[20:21], 24
+; SI-NEXT:    s_lshr_b64 s[62:63], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[74:75], s[20:21], 8
+; SI-NEXT:    s_lshr_b64 s[88:89], s[18:19], 24
+; SI-NEXT:    s_lshr_b64 s[90:91], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[92:93], s[18:19], 8
+; SI-NEXT:    s_lshr_b64 s[94:95], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[30:31], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[34:35], s[16:17], 8
+; SI-NEXT:    s_cbranch_execnz .LBB49_4
+; SI-NEXT:  .LBB49_2: ; %cmp.true
+; SI-NEXT:    v_add_f32_e64 v20, s17, 1.0
+; SI-NEXT:    v_add_f32_e64 v22, s16, 1.0
+; SI-NEXT:    v_add_f32_e64 v16, s19, 1.0
+; SI-NEXT:    v_add_f32_e64 v18, s18, 1.0
+; SI-NEXT:    v_add_f32_e64 v11, s21, 1.0
+; SI-NEXT:    v_add_f32_e64 v15, s20, 1.0
+; SI-NEXT:    v_add_f32_e64 v9, s23, 1.0
+; SI-NEXT:    v_add_f32_e64 v10, s22, 1.0
+; SI-NEXT:    v_add_f32_e64 v7, s25, 1.0
+; SI-NEXT:    v_add_f32_e64 v8, s24, 1.0
+; SI-NEXT:    v_add_f32_e64 v5, s27, 1.0
+; SI-NEXT:    v_add_f32_e64 v6, s26, 1.0
+; SI-NEXT:    v_add_f32_e64 v3, s29, 1.0
+; SI-NEXT:    v_add_f32_e64 v4, s28, 1.0
+; SI-NEXT:    v_add_f32_e64 v1, s37, 1.0
+; SI-NEXT:    v_add_f32_e64 v2, s36, 1.0
+; SI-NEXT:    v_readfirstlane_b32 s16, v22
+; SI-NEXT:    v_readfirstlane_b32 s17, v20
+; SI-NEXT:    v_readfirstlane_b32 s18, v18
+; SI-NEXT:    v_readfirstlane_b32 s19, v16
+; SI-NEXT:    v_readfirstlane_b32 s20, v15
+; SI-NEXT:    v_readfirstlane_b32 s21, v11
+; SI-NEXT:    v_readfirstlane_b32 s22, v10
+; SI-NEXT:    v_readfirstlane_b32 s23, v9
+; SI-NEXT:    v_readfirstlane_b32 s24, v8
+; SI-NEXT:    v_readfirstlane_b32 s25, v7
+; SI-NEXT:    v_readfirstlane_b32 s26, v6
+; SI-NEXT:    v_readfirstlane_b32 s27, v5
+; SI-NEXT:    v_readfirstlane_b32 s14, v4
+; SI-NEXT:    v_readfirstlane_b32 s15, v3
+; SI-NEXT:    v_readfirstlane_b32 s8, v2
+; SI-NEXT:    v_readfirstlane_b32 s9, v1
+; SI-NEXT:    s_lshr_b64 s[4:5], s[8:9], 24
+; SI-NEXT:    s_lshr_b64 s[6:7], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[8:9], 8
+; SI-NEXT:    s_lshr_b64 s[10:11], s[14:15], 24
+; SI-NEXT:    s_lshr_b64 s[12:13], s[14:15], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[14:15], 8
+; SI-NEXT:    s_lshr_b64 s[40:41], s[26:27], 24
+; SI-NEXT:    s_lshr_b64 s[42:43], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[26:27], 8
+; SI-NEXT:    s_lshr_b64 s[46:47], s[24:25], 24
+; SI-NEXT:    s_lshr_b64 s[56:57], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[58:59], s[24:25], 8
+; SI-NEXT:    s_lshr_b64 s[72:73], s[22:23], 24
+; SI-NEXT:    s_lshr_b64 s[76:77], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[78:79], s[22:23], 8
+; SI-NEXT:    s_lshr_b64 s[60:61], s[20:21], 24
+; SI-NEXT:    s_lshr_b64 s[62:63], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[74:75], s[20:21], 8
+; SI-NEXT:    s_lshr_b64 s[88:89], s[18:19], 24
+; SI-NEXT:    s_lshr_b64 s[90:91], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[92:93], s[18:19], 8
+; SI-NEXT:    s_lshr_b64 s[94:95], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[30:31], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[34:35], s[16:17], 8
+; SI-NEXT:    v_lshrrev_b32_e32 v12, 24, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 8, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v17, 24, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v21, 8, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v23, 24, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v24, 16, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v25, 8, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v26, 24, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v28, 8, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v29, 24, v9
+; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v9
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 8, v9
+; SI-NEXT:    v_lshrrev_b32_e32 v32, 24, v11
+; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v11
+; SI-NEXT:    v_lshrrev_b32_e32 v34, 8, v11
+; SI-NEXT:    v_lshrrev_b32_e32 v35, 24, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v37, 8, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v38, 24, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v48, 8, v20
+; SI-NEXT:    s_branch .LBB49_5
+; SI-NEXT:  .LBB49_3:
+; SI-NEXT:    ; implicit-def: $sgpr34
+; SI-NEXT:    ; implicit-def: $sgpr30
+; SI-NEXT:    ; implicit-def: $sgpr94
+; SI-NEXT:    ; implicit-def: $sgpr49
+; SI-NEXT:    ; implicit-def: $sgpr39
+; SI-NEXT:    ; implicit-def: $sgpr38
+; SI-NEXT:    ; implicit-def: $sgpr92
+; SI-NEXT:    ; implicit-def: $sgpr90
+; SI-NEXT:    ; implicit-def: $sgpr88
+; SI-NEXT:    ; implicit-def: $sgpr52
+; SI-NEXT:    ; implicit-def: $sgpr50
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr74
+; SI-NEXT:    ; implicit-def: $sgpr62
+; SI-NEXT:    ; implicit-def: $sgpr60
+; SI-NEXT:    ; implicit-def: $sgpr55
+; SI-NEXT:    ; implicit-def: $sgpr53
+; SI-NEXT:    ; implicit-def: $sgpr51
+; SI-NEXT:    ; implicit-def: $sgpr66
+; SI-NEXT:    ; implicit-def: $sgpr64
+; SI-NEXT:    ; implicit-def: $sgpr54
+; SI-NEXT:    ; implicit-def: $sgpr69
+; SI-NEXT:    ; implicit-def: $sgpr67
+; SI-NEXT:    ; implicit-def: $sgpr65
+; SI-NEXT:    ; implicit-def: $sgpr80
+; SI-NEXT:    ; implicit-def: $sgpr70
+; SI-NEXT:    ; implicit-def: $sgpr68
+; SI-NEXT:    ; implicit-def: $sgpr83
+; SI-NEXT:    ; implicit-def: $sgpr81
+; SI-NEXT:    ; implicit-def: $sgpr71
+; SI-NEXT:    ; implicit-def: $sgpr85
+; SI-NEXT:    ; implicit-def: $sgpr84
+; SI-NEXT:    ; implicit-def: $sgpr82
+; SI-NEXT:    ; implicit-def: $sgpr78
+; SI-NEXT:    ; implicit-def: $sgpr76
+; SI-NEXT:    ; implicit-def: $sgpr72
+; SI-NEXT:    ; implicit-def: $sgpr58
+; SI-NEXT:    ; implicit-def: $sgpr56
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr44
+; SI-NEXT:    ; implicit-def: $sgpr42
+; SI-NEXT:    ; implicit-def: $sgpr40
+; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr8
+; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    s_branch .LBB49_2
+; SI-NEXT:  .LBB49_4:
+; SI-NEXT:    v_mov_b32_e32 v22, s16
+; SI-NEXT:    v_mov_b32_e32 v20, s17
+; SI-NEXT:    v_mov_b32_e32 v18, s18
+; SI-NEXT:    v_mov_b32_e32 v16, s19
 ; SI-NEXT:    v_mov_b32_e32 v15, s20
-; SI-NEXT:    v_mov_b32_e32 v14, s21
-; SI-NEXT:    v_mov_b32_e32 v11, s22
+; SI-NEXT:    v_mov_b32_e32 v11, s21
+; SI-NEXT:    v_mov_b32_e32 v10, s22
 ; SI-NEXT:    v_mov_b32_e32 v9, s23
 ; SI-NEXT:    v_mov_b32_e32 v8, s24
 ; SI-NEXT:    v_mov_b32_e32 v7, s25
 ; SI-NEXT:    v_mov_b32_e32 v6, s26
 ; SI-NEXT:    v_mov_b32_e32 v5, s27
 ; SI-NEXT:    v_mov_b32_e32 v4, s28
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    v_mov_b32_e32 v3, s29
-; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    s_cbranch_scc0 .LBB49_4
-; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_alignbit_b32 v10, v2, v1, 24
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT:    v_alignbit_b32 v12, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v13, v2, v1, 8
-; SI-NEXT:    v_alignbit_b32 v16, v3, v4, 24
-; SI-NEXT:    v_alignbit_b32 v17, v3, v4, 16
-; SI-NEXT:    v_alignbit_b32 v18, v3, v4, 8
-; SI-NEXT:    v_alignbit_b32 v21, v5, v6, 24
-; SI-NEXT:    v_alignbit_b32 v22, v5, v6, 16
-; SI-NEXT:    v_alignbit_b32 v23, v5, v6, 8
-; SI-NEXT:    v_alignbit_b32 v29, v7, v8, 24
-; SI-NEXT:    v_alignbit_b32 v30, v7, v8, 16
-; SI-NEXT:    v_alignbit_b32 v31, v7, v8, 8
-; SI-NEXT:    v_alignbit_b32 v35, v9, v11, 24
-; SI-NEXT:    v_alignbit_b32 v36, v9, v11, 16
-; SI-NEXT:    v_alignbit_b32 v37, v9, v11, 8
-; SI-NEXT:    v_alignbit_b32 v49, v14, v15, 24
-; SI-NEXT:    v_alignbit_b32 v50, v14, v15, 16
-; SI-NEXT:    v_alignbit_b32 v52, v14, v15, 8
-; SI-NEXT:    v_alignbit_b32 v55, v19, v20, 24
-; SI-NEXT:    v_alignbit_b32 v41, v19, v20, 16
-; SI-NEXT:    v_alignbit_b32 v43, v19, v20, 8
-; SI-NEXT:    v_alignbit_b32 v46, v25, v28, 24
-; SI-NEXT:    v_alignbit_b32 v56, v25, v28, 16
-; SI-NEXT:    s_waitcnt expcnt(6)
-; SI-NEXT:    v_alignbit_b32 v58, v25, v28, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v24, 24, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v26, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 8, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 24, v3
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v3
-; SI-NEXT:    v_lshrrev_b32_e32 v34, 8, v3
-; SI-NEXT:    v_lshrrev_b32_e32 v38, 24, v5
-; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v5
-; SI-NEXT:    v_lshrrev_b32_e32 v48, 8, v5
-; SI-NEXT:    v_lshrrev_b32_e32 v51, 24, v7
-; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v7
-; SI-NEXT:    v_lshrrev_b32_e32 v54, 8, v7
-; SI-NEXT:    v_lshrrev_b32_e32 v40, 24, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v44, 8, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v45, 24, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v57, 8, v14
-; SI-NEXT:    s_waitcnt expcnt(5)
-; SI-NEXT:    v_lshrrev_b32_e32 v59, 24, v19
-; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v19
-; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_lshrrev_b32_e32 v61, 8, v19
-; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_lshrrev_b32_e32 v62, 24, v25
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v25
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 8, v25
-; SI-NEXT:    s_cbranch_execnz .LBB49_3
-; SI-NEXT:  .LBB49_2: ; %cmp.true
-; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_add_f32_e32 v1, 1.0, v1
-; SI-NEXT:    v_add_f32_e32 v25, 1.0, v25
-; SI-NEXT:    v_add_f32_e32 v28, 1.0, v28
-; SI-NEXT:    v_add_f32_e32 v19, 1.0, v19
-; SI-NEXT:    v_add_f32_e32 v20, 1.0, v20
-; SI-NEXT:    v_add_f32_e32 v14, 1.0, v14
-; SI-NEXT:    v_add_f32_e32 v15, 1.0, v15
-; SI-NEXT:    v_add_f32_e32 v9, 1.0, v9
-; SI-NEXT:    v_add_f32_e32 v11, 1.0, v11
-; SI-NEXT:    v_add_f32_e32 v7, 1.0, v7
-; SI-NEXT:    v_add_f32_e32 v8, 1.0, v8
-; SI-NEXT:    v_add_f32_e32 v5, 1.0, v5
-; SI-NEXT:    v_add_f32_e32 v6, 1.0, v6
-; SI-NEXT:    v_add_f32_e32 v3, 1.0, v3
-; SI-NEXT:    v_add_f32_e32 v4, 1.0, v4
-; SI-NEXT:    v_alignbit_b32 v10, v2, v1, 24
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT:    v_alignbit_b32 v12, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v13, v2, v1, 8
-; SI-NEXT:    v_alignbit_b32 v16, v3, v4, 24
-; SI-NEXT:    v_alignbit_b32 v17, v3, v4, 16
-; SI-NEXT:    v_alignbit_b32 v18, v3, v4, 8
-; SI-NEXT:    v_alignbit_b32 v21, v5, v6, 24
-; SI-NEXT:    v_alignbit_b32 v22, v5, v6, 16
-; SI-NEXT:    v_alignbit_b32 v23, v5, v6, 8
-; SI-NEXT:    v_alignbit_b32 v29, v7, v8, 24
-; SI-NEXT:    v_alignbit_b32 v30, v7, v8, 16
-; SI-NEXT:    v_alignbit_b32 v31, v7, v8, 8
-; SI-NEXT:    v_alignbit_b32 v35, v9, v11, 24
-; SI-NEXT:    v_alignbit_b32 v36, v9, v11, 16
-; SI-NEXT:    v_alignbit_b32 v37, v9, v11, 8
-; SI-NEXT:    v_alignbit_b32 v49, v14, v15, 24
-; SI-NEXT:    v_alignbit_b32 v50, v14, v15, 16
-; SI-NEXT:    v_alignbit_b32 v52, v14, v15, 8
-; SI-NEXT:    v_alignbit_b32 v55, v19, v20, 24
-; SI-NEXT:    v_alignbit_b32 v41, v19, v20, 16
-; SI-NEXT:    v_alignbit_b32 v43, v19, v20, 8
-; SI-NEXT:    v_alignbit_b32 v46, v25, v28, 24
-; SI-NEXT:    v_alignbit_b32 v56, v25, v28, 16
-; SI-NEXT:    s_waitcnt expcnt(6)
-; SI-NEXT:    v_alignbit_b32 v58, v25, v28, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v24, 24, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v26, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 8, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 24, v3
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v3
-; SI-NEXT:    v_lshrrev_b32_e32 v34, 8, v3
-; SI-NEXT:    v_lshrrev_b32_e32 v38, 24, v5
-; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v5
-; SI-NEXT:    v_lshrrev_b32_e32 v48, 8, v5
-; SI-NEXT:    v_lshrrev_b32_e32 v51, 24, v7
-; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v7
-; SI-NEXT:    v_lshrrev_b32_e32 v54, 8, v7
-; SI-NEXT:    v_lshrrev_b32_e32 v40, 24, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v44, 8, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v45, 24, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v57, 8, v14
-; SI-NEXT:    s_waitcnt expcnt(5)
-; SI-NEXT:    v_lshrrev_b32_e32 v59, 24, v19
-; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v19
-; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_lshrrev_b32_e32 v61, 8, v19
-; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_lshrrev_b32_e32 v62, 24, v25
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v25
+; SI-NEXT:    v_mov_b32_e32 v2, s36
+; SI-NEXT:    v_mov_b32_e32 v1, s37
+; SI-NEXT:    v_mov_b32_e32 v48, s49
+; SI-NEXT:    v_mov_b32_e32 v39, s39
+; SI-NEXT:    v_mov_b32_e32 v38, s38
+; SI-NEXT:    v_mov_b32_e32 v37, s52
+; SI-NEXT:    v_mov_b32_e32 v36, s50
+; SI-NEXT:    v_mov_b32_e32 v35, s48
+; SI-NEXT:    v_mov_b32_e32 v34, s55
+; SI-NEXT:    v_mov_b32_e32 v33, s53
+; SI-NEXT:    v_mov_b32_e32 v32, s51
+; SI-NEXT:    v_mov_b32_e32 v31, s66
+; SI-NEXT:    v_mov_b32_e32 v30, s64
+; SI-NEXT:    v_mov_b32_e32 v29, s54
+; SI-NEXT:    v_mov_b32_e32 v28, s69
+; SI-NEXT:    v_mov_b32_e32 v27, s67
+; SI-NEXT:    v_mov_b32_e32 v26, s65
+; SI-NEXT:    v_mov_b32_e32 v25, s80
+; SI-NEXT:    v_mov_b32_e32 v24, s70
+; SI-NEXT:    v_mov_b32_e32 v23, s68
+; SI-NEXT:    v_mov_b32_e32 v21, s83
+; SI-NEXT:    v_mov_b32_e32 v19, s81
+; SI-NEXT:    v_mov_b32_e32 v17, s71
+; SI-NEXT:    v_mov_b32_e32 v14, s85
+; SI-NEXT:    v_mov_b32_e32 v13, s84
+; SI-NEXT:    v_mov_b32_e32 v12, s82
+; SI-NEXT:  .LBB49_5: ; %end
+; SI-NEXT:    v_and_b32_e32 v22, 0xff, v22
+; SI-NEXT:    s_lshl_b32 s5, s34, 8
+; SI-NEXT:    v_or_b32_e32 v22, s5, v22
+; SI-NEXT:    s_and_b32 s5, s30, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    s_lshl_b32 s7, s94, 24
+; SI-NEXT:    v_and_b32_e32 v22, 0xffff, v22
+; SI-NEXT:    s_or_b32 s5, s7, s5
+; SI-NEXT:    v_or_b32_e32 v22, s5, v22
+; SI-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v20, 0xff, v20
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 8, v25
-; SI-NEXT:  .LBB49_3: ; %end
-; SI-NEXT:    v_and_b32_e32 v28, 0xff, v28
-; SI-NEXT:    s_waitcnt expcnt(5)
-; SI-NEXT:    v_lshlrev_b32_e32 v58, 8, v58
-; SI-NEXT:    v_and_b32_e32 v56, 0xff, v56
-; SI-NEXT:    v_or_b32_e32 v28, v28, v58
-; SI-NEXT:    v_lshlrev_b32_e32 v56, 16, v56
-; SI-NEXT:    v_lshlrev_b32_e32 v46, 24, v46
-; SI-NEXT:    v_or_b32_e32 v46, v46, v56
-; SI-NEXT:    v_and_b32_e32 v28, 0xffff, v28
-; SI-NEXT:    v_and_b32_e32 v25, 0xff, v25
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
-; SI-NEXT:    v_or_b32_e32 v28, v28, v46
-; SI-NEXT:    v_or_b32_e32 v10, v25, v10
-; SI-NEXT:    v_and_b32_e32 v25, 0xff, v63
-; SI-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; SI-NEXT:    v_lshlrev_b32_e32 v22, 8, v48
+; SI-NEXT:    v_or_b32_e32 v20, v20, v22
+; SI-NEXT:    v_and_b32_e32 v22, 0xff, v39
+; SI-NEXT:    v_and_b32_e32 v18, 0xff, v18
+; SI-NEXT:    s_lshl_b32 s5, s92, 8
+; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; SI-NEXT:    v_lshlrev_b32_e32 v38, 24, v38
+; SI-NEXT:    v_or_b32_e32 v18, s5, v18
+; SI-NEXT:    s_and_b32 s5, s90, 0xff
+; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v20
+; SI-NEXT:    v_or_b32_e32 v22, v38, v22
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    s_lshl_b32 s7, s88, 24
+; SI-NEXT:    v_or_b32_e32 v20, v20, v22
+; SI-NEXT:    v_add_i32_e32 v22, vcc, 4, v0
+; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v18
+; SI-NEXT:    s_or_b32 s5, s7, s5
+; SI-NEXT:    buffer_store_dword v20, v22, s[0:3], 0 offen
+; SI-NEXT:    v_or_b32_e32 v18, s5, v18
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v28, 24, v62
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; SI-NEXT:    v_or_b32_e32 v25, v28, v25
-; SI-NEXT:    v_or_b32_e32 v10, v10, v25
-; SI-NEXT:    v_add_i32_e32 v25, vcc, 4, v0
-; SI-NEXT:    buffer_store_dword v10, v25, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v10, 0xff, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 8, v43
-; SI-NEXT:    v_or_b32_e32 v10, v10, v20
-; SI-NEXT:    v_and_b32_e32 v20, 0xff, v41
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 24, v55
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; SI-NEXT:    v_or_b32_e32 v20, v25, v20
-; SI-NEXT:    v_or_b32_e32 v10, v10, v20
 ; SI-NEXT:    v_add_i32_e32 v20, vcc, 8, v0
-; SI-NEXT:    buffer_store_dword v10, v20, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v18, v20, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v16, 0xff, v16
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v10, 0xff, v19
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 8, v61
-; SI-NEXT:    v_or_b32_e32 v10, v10, v19
-; SI-NEXT:    v_and_b32_e32 v19, 0xff, v60
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 24, v59
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; SI-NEXT:    v_or_b32_e32 v19, v20, v19
-; SI-NEXT:    v_or_b32_e32 v10, v10, v19
-; SI-NEXT:    v_add_i32_e32 v19, vcc, 12, v0
-; SI-NEXT:    buffer_store_dword v10, v19, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v10, 0xff, v15
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 8, v52
-; SI-NEXT:    v_or_b32_e32 v10, v10, v15
-; SI-NEXT:    v_and_b32_e32 v15, 0xff, v50
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 8, v37
+; SI-NEXT:    v_or_b32_e32 v16, v16, v18
+; SI-NEXT:    v_and_b32_e32 v18, 0xff, v36
+; SI-NEXT:    v_and_b32_e32 v15, 0xff, v15
+; SI-NEXT:    s_lshl_b32 s5, s74, 8
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 24, v35
+; SI-NEXT:    v_or_b32_e32 v15, s5, v15
+; SI-NEXT:    s_and_b32 s5, s62, 0xff
+; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; SI-NEXT:    v_or_b32_e32 v18, v20, v18
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    s_lshl_b32 s7, s60, 24
+; SI-NEXT:    v_or_b32_e32 v16, v16, v18
+; SI-NEXT:    v_add_i32_e32 v18, vcc, 12, v0
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; SI-NEXT:    s_or_b32 s5, s7, s5
+; SI-NEXT:    buffer_store_dword v16, v18, s[0:3], 0 offen
+; SI-NEXT:    v_or_b32_e32 v15, s5, v15
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_add_i32_e32 v16, vcc, 16, v0
+; SI-NEXT:    buffer_store_dword v15, v16, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 8, v34
+; SI-NEXT:    v_or_b32_e32 v11, v11, v15
+; SI-NEXT:    v_and_b32_e32 v15, 0xff, v33
+; SI-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; SI-NEXT:    s_lshl_b32 s5, s78, 8
 ; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 24, v49
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; SI-NEXT:    v_or_b32_e32 v15, v19, v15
-; SI-NEXT:    v_or_b32_e32 v10, v10, v15
-; SI-NEXT:    v_add_i32_e32 v15, vcc, 16, v0
-; SI-NEXT:    buffer_store_dword v10, v15, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v10, 0xff, v14
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 8, v57
-; SI-NEXT:    v_or_b32_e32 v10, v10, v14
-; SI-NEXT:    v_and_b32_e32 v14, 0xff, v47
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 24, v45
+; SI-NEXT:    v_lshlrev_b32_e32 v16, 24, v32
+; SI-NEXT:    v_or_b32_e32 v10, s5, v10
+; SI-NEXT:    s_and_b32 s5, s76, 0xff
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; SI-NEXT:    v_or_b32_e32 v15, v16, v15
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    s_lshl_b32 s7, s72, 24
+; SI-NEXT:    v_or_b32_e32 v11, v11, v15
+; SI-NEXT:    v_add_i32_e32 v15, vcc, 20, v0
 ; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; SI-NEXT:    v_or_b32_e32 v14, v15, v14
-; SI-NEXT:    v_or_b32_e32 v10, v10, v14
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 20, v0
-; SI-NEXT:    buffer_store_dword v10, v14, s[0:3], 0 offen
+; SI-NEXT:    s_or_b32 s5, s7, s5
+; SI-NEXT:    buffer_store_dword v11, v15, s[0:3], 0 offen
+; SI-NEXT:    v_or_b32_e32 v10, s5, v10
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v10, 0xff, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 8, v37
-; SI-NEXT:    v_or_b32_e32 v10, v10, v11
-; SI-NEXT:    v_and_b32_e32 v11, 0xff, v36
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 24, v35
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; SI-NEXT:    v_or_b32_e32 v11, v14, v11
-; SI-NEXT:    v_or_b32_e32 v10, v10, v11
 ; SI-NEXT:    v_add_i32_e32 v11, vcc, 24, v0
 ; SI-NEXT:    buffer_store_dword v10, v11, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v9, 0xff, v9
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 8, v44
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 8, v31
 ; SI-NEXT:    v_or_b32_e32 v9, v9, v10
-; SI-NEXT:    v_and_b32_e32 v10, 0xff, v42
+; SI-NEXT:    v_and_b32_e32 v10, 0xff, v30
+; SI-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; SI-NEXT:    s_lshl_b32 s5, s58, 8
 ; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v40
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v29
+; SI-NEXT:    v_or_b32_e32 v8, s5, v8
+; SI-NEXT:    s_and_b32 s5, s56, 0xff
 ; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
 ; SI-NEXT:    v_or_b32_e32 v10, v11, v10
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    s_lshl_b32 s7, s46, 24
 ; SI-NEXT:    v_or_b32_e32 v9, v9, v10
 ; SI-NEXT:    v_add_i32_e32 v10, vcc, 28, v0
+; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT:    s_or_b32 s5, s7, s5
 ; SI-NEXT:    buffer_store_dword v9, v10, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; SI-NEXT:    v_or_b32_e32 v8, s5, v8
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 8, v31
-; SI-NEXT:    v_or_b32_e32 v8, v8, v9
-; SI-NEXT:    v_and_b32_e32 v9, 0xff, v30
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 24, v29
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; SI-NEXT:    v_or_b32_e32 v9, v10, v9
-; SI-NEXT:    v_or_b32_e32 v8, v8, v9
 ; SI-NEXT:    v_add_i32_e32 v9, vcc, 32, v0
 ; SI-NEXT:    buffer_store_dword v8, v9, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 8, v54
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 8, v28
 ; SI-NEXT:    v_or_b32_e32 v7, v7, v8
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v53
+; SI-NEXT:    v_and_b32_e32 v8, 0xff, v27
+; SI-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; SI-NEXT:    s_lshl_b32 s5, s44, 8
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 24, v51
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 24, v26
+; SI-NEXT:    v_or_b32_e32 v6, s5, v6
+; SI-NEXT:    s_and_b32 s5, s42, 0xff
 ; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
 ; SI-NEXT:    v_or_b32_e32 v8, v9, v8
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    s_lshl_b32 s7, s40, 24
 ; SI-NEXT:    v_or_b32_e32 v7, v7, v8
 ; SI-NEXT:    v_add_i32_e32 v8, vcc, 36, v0
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT:    s_or_b32 s5, s7, s5
 ; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; SI-NEXT:    v_or_b32_e32 v6, s5, v6
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v23
-; SI-NEXT:    v_or_b32_e32 v6, v6, v7
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v22
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 24, v21
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; SI-NEXT:    v_or_b32_e32 v7, v8, v7
-; SI-NEXT:    v_or_b32_e32 v6, v6, v7
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 40, v0
 ; SI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v48
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v25
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v6
-; SI-NEXT:    v_and_b32_e32 v6, 0xff, v39
+; SI-NEXT:    v_and_b32_e32 v6, 0xff, v24
+; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; SI-NEXT:    s_lshl_b32 s5, s14, 8
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v38
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v23
+; SI-NEXT:    v_or_b32_e32 v4, s5, v4
+; SI-NEXT:    s_and_b32 s5, s12, 0xff
 ; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; SI-NEXT:    v_or_b32_e32 v6, v7, v6
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    s_lshl_b32 s7, s10, 24
 ; SI-NEXT:    v_or_b32_e32 v5, v5, v6
 ; SI-NEXT:    v_add_i32_e32 v6, vcc, 44, v0
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; SI-NEXT:    s_or_b32 s5, s7, s5
 ; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; SI-NEXT:    v_or_b32_e32 v4, s5, v4
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v18
-; SI-NEXT:    v_or_b32_e32 v4, v4, v5
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v17
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v16
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; SI-NEXT:    v_or_b32_e32 v5, v6, v5
-; SI-NEXT:    v_or_b32_e32 v4, v4, v5
 ; SI-NEXT:    v_add_i32_e32 v5, vcc, 48, v0
 ; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v34
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v21
 ; SI-NEXT:    v_or_b32_e32 v3, v3, v4
-; SI-NEXT:    v_and_b32_e32 v4, 0xff, v33
+; SI-NEXT:    v_and_b32_e32 v4, 0xff, v19
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT:    s_lshl_b32 s5, s8, 8
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 24, v32
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 24, v17
+; SI-NEXT:    v_or_b32_e32 v2, s5, v2
+; SI-NEXT:    s_and_b32 s5, s6, 0xff
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-NEXT:    v_or_b32_e32 v4, v5, v4
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    s_lshl_b32 s4, s4, 24
 ; SI-NEXT:    v_or_b32_e32 v3, v3, v4
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 52, v0
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT:    v_or_b32_e32 v2, s4, v2
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v13
-; SI-NEXT:    v_or_b32_e32 v1, v1, v3
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT:    v_or_b32_e32 v3, v4, v3
-; SI-NEXT:    v_or_b32_e32 v1, v1, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 56, v0
-; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v27
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v14
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v26
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v13
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v24
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v12
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 60, v0
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT:    v_readlane_b32 s85, v40, 29
+; SI-NEXT:    v_readlane_b32 s84, v40, 28
+; SI-NEXT:    v_readlane_b32 s83, v40, 27
+; SI-NEXT:    v_readlane_b32 s82, v40, 26
+; SI-NEXT:    v_readlane_b32 s81, v40, 25
+; SI-NEXT:    v_readlane_b32 s80, v40, 24
+; SI-NEXT:    v_readlane_b32 s71, v40, 23
+; SI-NEXT:    v_readlane_b32 s70, v40, 22
+; SI-NEXT:    v_readlane_b32 s69, v40, 21
+; SI-NEXT:    v_readlane_b32 s68, v40, 20
+; SI-NEXT:    v_readlane_b32 s67, v40, 19
+; SI-NEXT:    v_readlane_b32 s66, v40, 18
+; SI-NEXT:    v_readlane_b32 s65, v40, 17
+; SI-NEXT:    v_readlane_b32 s64, v40, 16
+; SI-NEXT:    v_readlane_b32 s55, v40, 15
+; SI-NEXT:    v_readlane_b32 s54, v40, 14
+; SI-NEXT:    v_readlane_b32 s53, v40, 13
+; SI-NEXT:    v_readlane_b32 s52, v40, 12
+; SI-NEXT:    v_readlane_b32 s51, v40, 11
+; SI-NEXT:    v_readlane_b32 s50, v40, 10
+; SI-NEXT:    v_readlane_b32 s49, v40, 9
+; SI-NEXT:    v_readlane_b32 s48, v40, 8
+; SI-NEXT:    v_readlane_b32 s39, v40, 7
+; SI-NEXT:    v_readlane_b32 s38, v40, 6
+; SI-NEXT:    v_readlane_b32 s37, v40, 5
+; SI-NEXT:    v_readlane_b32 s36, v40, 4
+; SI-NEXT:    v_readlane_b32 s35, v40, 3
+; SI-NEXT:    v_readlane_b32 s34, v40, 2
+; SI-NEXT:    v_readlane_b32 s31, v40, 1
+; SI-NEXT:    v_readlane_b32 s30, v40, 0
+; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
-; SI-NEXT:  .LBB49_4:
-; SI-NEXT:    ; implicit-def: $vgpr24
-; SI-NEXT:    ; implicit-def: $vgpr58
-; SI-NEXT:    ; implicit-def: $vgpr56
-; SI-NEXT:    ; implicit-def: $vgpr46
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $vgpr63
-; SI-NEXT:    ; implicit-def: $vgpr62
-; SI-NEXT:    ; implicit-def: $vgpr43
-; SI-NEXT:    ; implicit-def: $vgpr41
-; SI-NEXT:    ; implicit-def: $vgpr55
-; SI-NEXT:    ; implicit-def: $vgpr61
-; SI-NEXT:    ; implicit-def: $vgpr60
-; SI-NEXT:    ; implicit-def: $vgpr59
-; SI-NEXT:    ; implicit-def: $vgpr52
-; SI-NEXT:    ; implicit-def: $vgpr50
-; SI-NEXT:    ; implicit-def: $vgpr49
-; SI-NEXT:    ; implicit-def: $vgpr57
-; SI-NEXT:    ; implicit-def: $vgpr47
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr37
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr44
-; SI-NEXT:    ; implicit-def: $vgpr42
-; SI-NEXT:    ; implicit-def: $vgpr40
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr30
-; SI-NEXT:    ; implicit-def: $vgpr29
-; SI-NEXT:    ; implicit-def: $vgpr54
-; SI-NEXT:    ; implicit-def: $vgpr53
-; SI-NEXT:    ; implicit-def: $vgpr51
-; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr48
-; SI-NEXT:    ; implicit-def: $vgpr39
-; SI-NEXT:    ; implicit-def: $vgpr38
-; SI-NEXT:    ; implicit-def: $vgpr18
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr16
-; SI-NEXT:    ; implicit-def: $vgpr34
-; SI-NEXT:    ; implicit-def: $vgpr33
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; kill: killed $vgpr24
-; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $vgpr26
-; SI-NEXT:    ; implicit-def: $vgpr24
-; SI-NEXT:    s_branch .LBB49_2
 ;
 ; VI-LABEL: bitcast_v16f32_to_v64i8_scalar:
 ; VI:       ; %bb.0:
@@ -30813,96 +30969,113 @@ define inreg <32 x i16> @bitcast_v8i64_to_v32i16_scalar(<8 x i64> inreg %a, i32
 ; SI-LABEL: bitcast_v8i64_to_v32i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v3, v2
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; SI-NEXT:    v_mov_b32_e32 v30, v1
-; SI-NEXT:    v_mov_b32_e32 v28, v0
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; SI-NEXT:    v_readfirstlane_b32 s4, v0
+; SI-NEXT:    s_and_b64 s[6:7], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s5, v1
+; SI-NEXT:    s_cbranch_scc0 .LBB57_4
+; SI-NEXT:  ; %bb.1: ; %cmp.false
+; SI-NEXT:    s_lshr_b32 s56, s5, 16
+; SI-NEXT:    s_lshr_b32 s57, s29, 16
+; SI-NEXT:    s_lshr_b32 s58, s27, 16
+; SI-NEXT:    s_lshr_b32 s59, s25, 16
+; SI-NEXT:    s_lshr_b32 s60, s23, 16
+; SI-NEXT:    s_lshr_b32 s61, s21, 16
+; SI-NEXT:    s_lshr_b32 s62, s19, 16
+; SI-NEXT:    s_lshr_b32 s63, s17, 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[42:43], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[16:17], 16
+; SI-NEXT:    s_cbranch_execnz .LBB57_3
+; SI-NEXT:  .LBB57_2: ; %cmp.true
+; SI-NEXT:    s_add_u32 s4, s4, 3
+; SI-NEXT:    s_addc_u32 s5, s5, 0
+; SI-NEXT:    s_add_u32 s28, s28, 3
+; SI-NEXT:    s_addc_u32 s29, s29, 0
+; SI-NEXT:    s_add_u32 s26, s26, 3
+; SI-NEXT:    s_addc_u32 s27, s27, 0
+; SI-NEXT:    s_add_u32 s24, s24, 3
+; SI-NEXT:    s_addc_u32 s25, s25, 0
+; SI-NEXT:    s_add_u32 s22, s22, 3
+; SI-NEXT:    s_addc_u32 s23, s23, 0
+; SI-NEXT:    s_add_u32 s20, s20, 3
+; SI-NEXT:    s_addc_u32 s21, s21, 0
+; SI-NEXT:    s_add_u32 s18, s18, 3
+; SI-NEXT:    s_addc_u32 s19, s19, 0
+; SI-NEXT:    s_add_u32 s16, s16, 3
+; SI-NEXT:    s_addc_u32 s17, s17, 0
+; SI-NEXT:    s_lshr_b32 s56, s5, 16
+; SI-NEXT:    s_lshr_b32 s57, s29, 16
+; SI-NEXT:    s_lshr_b32 s58, s27, 16
+; SI-NEXT:    s_lshr_b32 s59, s25, 16
+; SI-NEXT:    s_lshr_b32 s60, s23, 16
+; SI-NEXT:    s_lshr_b32 s61, s21, 16
+; SI-NEXT:    s_lshr_b32 s62, s19, 16
+; SI-NEXT:    s_lshr_b32 s63, s17, 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[42:43], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[16:17], 16
+; SI-NEXT:  .LBB57_3: ; %end
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
+; SI-NEXT:    v_mov_b32_e32 v1, s44
 ; SI-NEXT:    v_mov_b32_e32 v2, s17
+; SI-NEXT:    v_mov_b32_e32 v3, s63
 ; SI-NEXT:    v_mov_b32_e32 v4, s18
+; SI-NEXT:    v_mov_b32_e32 v5, s42
 ; SI-NEXT:    v_mov_b32_e32 v6, s19
+; SI-NEXT:    v_mov_b32_e32 v7, s62
 ; SI-NEXT:    v_mov_b32_e32 v8, s20
+; SI-NEXT:    v_mov_b32_e32 v9, s40
 ; SI-NEXT:    v_mov_b32_e32 v10, s21
+; SI-NEXT:    v_mov_b32_e32 v11, s61
 ; SI-NEXT:    v_mov_b32_e32 v12, s22
+; SI-NEXT:    v_mov_b32_e32 v13, s14
 ; SI-NEXT:    v_mov_b32_e32 v14, s23
+; SI-NEXT:    v_mov_b32_e32 v15, s60
 ; SI-NEXT:    v_mov_b32_e32 v16, s24
+; SI-NEXT:    v_mov_b32_e32 v17, s12
 ; SI-NEXT:    v_mov_b32_e32 v18, s25
+; SI-NEXT:    v_mov_b32_e32 v19, s59
 ; SI-NEXT:    v_mov_b32_e32 v20, s26
+; SI-NEXT:    v_mov_b32_e32 v21, s10
 ; SI-NEXT:    v_mov_b32_e32 v22, s27
+; SI-NEXT:    v_mov_b32_e32 v23, s58
 ; SI-NEXT:    v_mov_b32_e32 v24, s28
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    v_mov_b32_e32 v25, s8
 ; SI-NEXT:    v_mov_b32_e32 v26, s29
-; SI-NEXT:    s_cbranch_scc0 .LBB57_4
-; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_alignbit_b32 v29, v30, v28, 16
-; SI-NEXT:    v_alignbit_b32 v25, v26, v24, 16
-; SI-NEXT:    v_alignbit_b32 v21, v22, v20, 16
-; SI-NEXT:    v_alignbit_b32 v17, v18, v16, 16
-; SI-NEXT:    v_alignbit_b32 v13, v14, v12, 16
-; SI-NEXT:    v_alignbit_b32 v9, v10, v8, 16
-; SI-NEXT:    v_alignbit_b32 v5, v6, v4, 16
-; SI-NEXT:    v_alignbit_b32 v1, v2, v0, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v30
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v26
-; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v22
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-NEXT:    s_cbranch_execnz .LBB57_3
-; SI-NEXT:  .LBB57_2: ; %cmp.true
-; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
-; SI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 3, v4
-; SI-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 3, v8
-; SI-NEXT:    v_addc_u32_e32 v10, vcc, 0, v10, vcc
-; SI-NEXT:    v_add_i32_e32 v12, vcc, 3, v12
-; SI-NEXT:    v_addc_u32_e32 v14, vcc, 0, v14, vcc
-; SI-NEXT:    v_add_i32_e32 v16, vcc, 3, v16
-; SI-NEXT:    v_addc_u32_e32 v18, vcc, 0, v18, vcc
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 3, v20
-; SI-NEXT:    v_addc_u32_e32 v22, vcc, 0, v22, vcc
-; SI-NEXT:    v_add_i32_e32 v24, vcc, 3, v24
-; SI-NEXT:    v_addc_u32_e32 v26, vcc, 0, v26, vcc
-; SI-NEXT:    v_add_i32_e32 v28, vcc, 3, v28
-; SI-NEXT:    v_addc_u32_e32 v30, vcc, 0, v30, vcc
-; SI-NEXT:    v_alignbit_b32 v29, v30, v28, 16
-; SI-NEXT:    v_alignbit_b32 v25, v26, v24, 16
-; SI-NEXT:    v_alignbit_b32 v21, v22, v20, 16
-; SI-NEXT:    v_alignbit_b32 v17, v18, v16, 16
-; SI-NEXT:    v_alignbit_b32 v13, v14, v12, 16
-; SI-NEXT:    v_alignbit_b32 v9, v10, v8, 16
-; SI-NEXT:    v_alignbit_b32 v5, v6, v4, 16
-; SI-NEXT:    v_alignbit_b32 v1, v2, v0, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v30
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v26
-; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v22
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-NEXT:  .LBB57_3: ; %end
+; SI-NEXT:    v_mov_b32_e32 v27, s57
+; SI-NEXT:    v_mov_b32_e32 v28, s4
+; SI-NEXT:    v_mov_b32_e32 v29, s6
+; SI-NEXT:    v_mov_b32_e32 v30, s5
+; SI-NEXT:    v_mov_b32_e32 v31, s56
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB57_4:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $vgpr7
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $vgpr15
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $vgpr29
-; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; implicit-def: $sgpr44
+; SI-NEXT:    ; implicit-def: $sgpr63
+; SI-NEXT:    ; implicit-def: $sgpr42
+; SI-NEXT:    ; implicit-def: $sgpr62
+; SI-NEXT:    ; implicit-def: $sgpr40
+; SI-NEXT:    ; implicit-def: $sgpr61
+; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr60
+; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr59
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr58
+; SI-NEXT:    ; implicit-def: $sgpr8
+; SI-NEXT:    ; implicit-def: $sgpr57
+; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr56
 ; SI-NEXT:    s_branch .LBB57_2
 ;
 ; VI-LABEL: bitcast_v8i64_to_v32i16_scalar:
@@ -38073,386 +38246,449 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in
 ; SI-LABEL: bitcast_v8i64_to_v64i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_writelane_b32 v4, s30, 0
+; SI-NEXT:    v_writelane_b32 v4, s31, 1
+; SI-NEXT:    v_writelane_b32 v4, s34, 2
+; SI-NEXT:    v_writelane_b32 v4, s35, 3
+; SI-NEXT:    v_writelane_b32 v4, s36, 4
+; SI-NEXT:    v_writelane_b32 v4, s37, 5
+; SI-NEXT:    v_writelane_b32 v4, s38, 6
+; SI-NEXT:    v_writelane_b32 v4, s39, 7
+; SI-NEXT:    v_writelane_b32 v4, s48, 8
+; SI-NEXT:    v_writelane_b32 v4, s49, 9
+; SI-NEXT:    v_writelane_b32 v4, s50, 10
+; SI-NEXT:    v_writelane_b32 v4, s51, 11
+; SI-NEXT:    v_writelane_b32 v4, s52, 12
+; SI-NEXT:    v_writelane_b32 v4, s53, 13
+; SI-NEXT:    v_writelane_b32 v4, s54, 14
+; SI-NEXT:    v_writelane_b32 v4, s55, 15
+; SI-NEXT:    v_writelane_b32 v4, s64, 16
+; SI-NEXT:    v_writelane_b32 v4, s65, 17
+; SI-NEXT:    v_writelane_b32 v4, s66, 18
+; SI-NEXT:    v_writelane_b32 v4, s67, 19
+; SI-NEXT:    v_writelane_b32 v4, s68, 20
+; SI-NEXT:    v_writelane_b32 v4, s69, 21
+; SI-NEXT:    v_writelane_b32 v4, s70, 22
+; SI-NEXT:    v_writelane_b32 v4, s71, 23
+; SI-NEXT:    v_writelane_b32 v4, s80, 24
+; SI-NEXT:    v_writelane_b32 v4, s81, 25
+; SI-NEXT:    v_writelane_b32 v4, s82, 26
+; SI-NEXT:    v_writelane_b32 v4, s83, 27
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; SI-NEXT:    v_readfirstlane_b32 s7, v1
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_readfirstlane_b32 s6, v2
+; SI-NEXT:    v_writelane_b32 v4, s84, 28
+; SI-NEXT:    v_readfirstlane_b32 s4, v1
+; SI-NEXT:    s_and_b64 s[6:7], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s5, v2
+; SI-NEXT:    v_writelane_b32 v4, s85, 29
 ; SI-NEXT:    s_cbranch_scc0 .LBB69_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v3, s7
-; SI-NEXT:    v_mov_b32_e32 v6, s28
-; SI-NEXT:    v_mov_b32_e32 v9, s26
-; SI-NEXT:    v_mov_b32_e32 v10, s24
-; SI-NEXT:    v_mov_b32_e32 v14, s22
-; SI-NEXT:    v_mov_b32_e32 v18, s20
-; SI-NEXT:    v_mov_b32_e32 v21, s18
-; SI-NEXT:    v_mov_b32_e32 v22, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v3, 24
-; SI-NEXT:    v_alignbit_b32 v2, s6, v3, 16
-; SI-NEXT:    v_alignbit_b32 v3, s6, v3, 8
-; SI-NEXT:    v_alignbit_b32 v4, s29, v6, 24
-; SI-NEXT:    v_alignbit_b32 v5, s29, v6, 16
-; SI-NEXT:    v_alignbit_b32 v6, s29, v6, 8
-; SI-NEXT:    v_alignbit_b32 v7, s27, v9, 24
-; SI-NEXT:    v_alignbit_b32 v8, s27, v9, 16
-; SI-NEXT:    v_alignbit_b32 v9, s27, v9, 8
-; SI-NEXT:    v_alignbit_b32 v13, s25, v10, 24
-; SI-NEXT:    v_alignbit_b32 v15, s25, v10, 16
-; SI-NEXT:    v_alignbit_b32 v10, s25, v10, 8
-; SI-NEXT:    v_alignbit_b32 v11, s23, v14, 24
-; SI-NEXT:    v_alignbit_b32 v12, s23, v14, 16
-; SI-NEXT:    v_alignbit_b32 v14, s23, v14, 8
-; SI-NEXT:    v_alignbit_b32 v16, s21, v18, 24
-; SI-NEXT:    v_alignbit_b32 v17, s21, v18, 16
-; SI-NEXT:    v_alignbit_b32 v18, s21, v18, 8
-; SI-NEXT:    v_alignbit_b32 v19, s19, v21, 24
-; SI-NEXT:    v_alignbit_b32 v20, s19, v21, 16
-; SI-NEXT:    v_alignbit_b32 v21, s19, v21, 8
-; SI-NEXT:    v_alignbit_b32 v23, s17, v22, 24
-; SI-NEXT:    v_alignbit_b32 v24, s17, v22, 16
-; SI-NEXT:    v_alignbit_b32 v22, s17, v22, 8
-; SI-NEXT:    s_lshr_b32 s8, s6, 24
-; SI-NEXT:    s_lshr_b32 s9, s6, 16
-; SI-NEXT:    s_lshr_b32 s10, s6, 8
-; SI-NEXT:    s_lshr_b32 s11, s29, 24
-; SI-NEXT:    s_lshr_b32 s12, s29, 16
-; SI-NEXT:    s_lshr_b32 s13, s29, 8
-; SI-NEXT:    s_lshr_b32 s14, s27, 24
-; SI-NEXT:    s_lshr_b32 s15, s27, 16
-; SI-NEXT:    s_lshr_b32 s40, s27, 8
-; SI-NEXT:    s_lshr_b32 s41, s25, 24
-; SI-NEXT:    s_lshr_b32 s42, s25, 16
-; SI-NEXT:    s_lshr_b32 s43, s25, 8
-; SI-NEXT:    s_lshr_b32 s44, s23, 24
-; SI-NEXT:    s_lshr_b32 s45, s23, 16
-; SI-NEXT:    s_lshr_b32 s46, s23, 8
-; SI-NEXT:    s_lshr_b32 s47, s21, 24
-; SI-NEXT:    s_lshr_b32 s56, s21, 16
-; SI-NEXT:    s_lshr_b32 s57, s21, 8
-; SI-NEXT:    s_lshr_b32 s58, s19, 24
-; SI-NEXT:    s_lshr_b32 s59, s19, 16
-; SI-NEXT:    s_lshr_b32 s60, s19, 8
-; SI-NEXT:    s_lshr_b32 s61, s17, 24
-; SI-NEXT:    s_lshr_b32 s62, s17, 16
-; SI-NEXT:    s_lshr_b32 s63, s17, 8
+; SI-NEXT:    s_lshr_b32 s38, s5, 24
+; SI-NEXT:    s_lshr_b32 s39, s5, 16
+; SI-NEXT:    s_lshr_b32 s48, s5, 8
+; SI-NEXT:    s_lshr_b32 s49, s29, 24
+; SI-NEXT:    s_lshr_b32 s50, s29, 16
+; SI-NEXT:    s_lshr_b32 s51, s29, 8
+; SI-NEXT:    s_lshr_b32 s52, s27, 24
+; SI-NEXT:    s_lshr_b32 s53, s27, 16
+; SI-NEXT:    s_lshr_b32 s54, s27, 8
+; SI-NEXT:    s_lshr_b32 s55, s25, 24
+; SI-NEXT:    s_lshr_b32 s64, s25, 16
+; SI-NEXT:    s_lshr_b32 s65, s25, 8
+; SI-NEXT:    s_lshr_b32 s66, s23, 24
+; SI-NEXT:    s_lshr_b32 s67, s23, 16
+; SI-NEXT:    s_lshr_b32 s68, s23, 8
+; SI-NEXT:    s_lshr_b32 s69, s21, 24
+; SI-NEXT:    s_lshr_b32 s70, s21, 16
+; SI-NEXT:    s_lshr_b32 s71, s21, 8
+; SI-NEXT:    s_lshr_b32 s80, s19, 24
+; SI-NEXT:    s_lshr_b32 s81, s19, 16
+; SI-NEXT:    s_lshr_b32 s82, s19, 8
+; SI-NEXT:    s_lshr_b32 s83, s17, 24
+; SI-NEXT:    s_lshr_b32 s84, s17, 16
+; SI-NEXT:    s_lshr_b32 s85, s17, 8
+; SI-NEXT:    s_lshr_b64 s[6:7], s[4:5], 24
+; SI-NEXT:    s_lshr_b64 s[8:9], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[4:5], 8
+; SI-NEXT:    s_lshr_b64 s[12:13], s[28:29], 24
+; SI-NEXT:    s_lshr_b64 s[14:15], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[28:29], 8
+; SI-NEXT:    s_lshr_b64 s[42:43], s[26:27], 24
+; SI-NEXT:    s_lshr_b64 s[44:45], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[26:27], 8
+; SI-NEXT:    s_lshr_b64 s[56:57], s[24:25], 24
+; SI-NEXT:    s_lshr_b64 s[58:59], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[24:25], 8
+; SI-NEXT:    s_lshr_b64 s[74:75], s[22:23], 24
+; SI-NEXT:    s_lshr_b64 s[78:79], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[88:89], s[22:23], 8
+; SI-NEXT:    s_lshr_b64 s[62:63], s[20:21], 24
+; SI-NEXT:    s_lshr_b64 s[72:73], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[76:77], s[20:21], 8
+; SI-NEXT:    s_lshr_b64 s[90:91], s[18:19], 24
+; SI-NEXT:    s_lshr_b64 s[92:93], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[94:95], s[18:19], 8
+; SI-NEXT:    s_lshr_b64 s[30:31], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[34:35], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[36:37], s[16:17], 8
 ; SI-NEXT:    s_cbranch_execnz .LBB69_3
 ; SI-NEXT:  .LBB69_2: ; %cmp.true
-; SI-NEXT:    s_add_u32 s16, s16, 3
-; SI-NEXT:    s_addc_u32 s17, s17, 0
-; SI-NEXT:    s_add_u32 s18, s18, 3
-; SI-NEXT:    s_addc_u32 s19, s19, 0
-; SI-NEXT:    s_add_u32 s20, s20, 3
-; SI-NEXT:    s_addc_u32 s21, s21, 0
-; SI-NEXT:    s_add_u32 s22, s22, 3
-; SI-NEXT:    s_addc_u32 s23, s23, 0
-; SI-NEXT:    s_add_u32 s24, s24, 3
-; SI-NEXT:    s_addc_u32 s25, s25, 0
-; SI-NEXT:    s_add_u32 s26, s26, 3
-; SI-NEXT:    s_addc_u32 s27, s27, 0
+; SI-NEXT:    s_add_u32 s4, s4, 3
+; SI-NEXT:    s_addc_u32 s5, s5, 0
 ; SI-NEXT:    s_add_u32 s28, s28, 3
 ; SI-NEXT:    s_addc_u32 s29, s29, 0
-; SI-NEXT:    s_add_u32 s7, s7, 3
-; SI-NEXT:    s_addc_u32 s6, s6, 0
-; SI-NEXT:    v_mov_b32_e32 v3, s7
-; SI-NEXT:    v_mov_b32_e32 v6, s28
-; SI-NEXT:    v_mov_b32_e32 v9, s26
-; SI-NEXT:    v_mov_b32_e32 v10, s24
-; SI-NEXT:    v_mov_b32_e32 v14, s22
-; SI-NEXT:    v_mov_b32_e32 v18, s20
-; SI-NEXT:    v_mov_b32_e32 v21, s18
-; SI-NEXT:    v_mov_b32_e32 v22, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v3, 24
-; SI-NEXT:    v_alignbit_b32 v2, s6, v3, 16
-; SI-NEXT:    v_alignbit_b32 v3, s6, v3, 8
-; SI-NEXT:    v_alignbit_b32 v4, s29, v6, 24
-; SI-NEXT:    v_alignbit_b32 v5, s29, v6, 16
-; SI-NEXT:    v_alignbit_b32 v6, s29, v6, 8
-; SI-NEXT:    v_alignbit_b32 v7, s27, v9, 24
-; SI-NEXT:    v_alignbit_b32 v8, s27, v9, 16
-; SI-NEXT:    v_alignbit_b32 v9, s27, v9, 8
-; SI-NEXT:    v_alignbit_b32 v13, s25, v10, 24
-; SI-NEXT:    v_alignbit_b32 v15, s25, v10, 16
-; SI-NEXT:    v_alignbit_b32 v10, s25, v10, 8
-; SI-NEXT:    v_alignbit_b32 v11, s23, v14, 24
-; SI-NEXT:    v_alignbit_b32 v12, s23, v14, 16
-; SI-NEXT:    v_alignbit_b32 v14, s23, v14, 8
-; SI-NEXT:    v_alignbit_b32 v16, s21, v18, 24
-; SI-NEXT:    v_alignbit_b32 v17, s21, v18, 16
-; SI-NEXT:    v_alignbit_b32 v18, s21, v18, 8
-; SI-NEXT:    v_alignbit_b32 v19, s19, v21, 24
-; SI-NEXT:    v_alignbit_b32 v20, s19, v21, 16
-; SI-NEXT:    v_alignbit_b32 v21, s19, v21, 8
-; SI-NEXT:    v_alignbit_b32 v23, s17, v22, 24
-; SI-NEXT:    v_alignbit_b32 v24, s17, v22, 16
-; SI-NEXT:    v_alignbit_b32 v22, s17, v22, 8
-; SI-NEXT:    s_lshr_b32 s8, s6, 24
-; SI-NEXT:    s_lshr_b32 s9, s6, 16
-; SI-NEXT:    s_lshr_b32 s10, s6, 8
-; SI-NEXT:    s_lshr_b32 s11, s29, 24
-; SI-NEXT:    s_lshr_b32 s12, s29, 16
-; SI-NEXT:    s_lshr_b32 s13, s29, 8
-; SI-NEXT:    s_lshr_b32 s14, s27, 24
-; SI-NEXT:    s_lshr_b32 s15, s27, 16
-; SI-NEXT:    s_lshr_b32 s40, s27, 8
-; SI-NEXT:    s_lshr_b32 s41, s25, 24
-; SI-NEXT:    s_lshr_b32 s42, s25, 16
-; SI-NEXT:    s_lshr_b32 s43, s25, 8
-; SI-NEXT:    s_lshr_b32 s44, s23, 24
-; SI-NEXT:    s_lshr_b32 s45, s23, 16
-; SI-NEXT:    s_lshr_b32 s46, s23, 8
-; SI-NEXT:    s_lshr_b32 s47, s21, 24
-; SI-NEXT:    s_lshr_b32 s56, s21, 16
-; SI-NEXT:    s_lshr_b32 s57, s21, 8
-; SI-NEXT:    s_lshr_b32 s58, s19, 24
-; SI-NEXT:    s_lshr_b32 s59, s19, 16
-; SI-NEXT:    s_lshr_b32 s60, s19, 8
-; SI-NEXT:    s_lshr_b32 s61, s17, 24
-; SI-NEXT:    s_lshr_b32 s62, s17, 16
-; SI-NEXT:    s_lshr_b32 s63, s17, 8
+; SI-NEXT:    s_add_u32 s26, s26, 3
+; SI-NEXT:    s_addc_u32 s27, s27, 0
+; SI-NEXT:    s_add_u32 s24, s24, 3
+; SI-NEXT:    s_addc_u32 s25, s25, 0
+; SI-NEXT:    s_add_u32 s22, s22, 3
+; SI-NEXT:    s_addc_u32 s23, s23, 0
+; SI-NEXT:    s_add_u32 s20, s20, 3
+; SI-NEXT:    s_addc_u32 s21, s21, 0
+; SI-NEXT:    s_add_u32 s18, s18, 3
+; SI-NEXT:    s_addc_u32 s19, s19, 0
+; SI-NEXT:    s_add_u32 s16, s16, 3
+; SI-NEXT:    s_addc_u32 s17, s17, 0
+; SI-NEXT:    s_lshr_b32 s38, s5, 24
+; SI-NEXT:    s_lshr_b32 s39, s5, 16
+; SI-NEXT:    s_lshr_b32 s48, s5, 8
+; SI-NEXT:    s_lshr_b32 s49, s29, 24
+; SI-NEXT:    s_lshr_b32 s50, s29, 16
+; SI-NEXT:    s_lshr_b32 s51, s29, 8
+; SI-NEXT:    s_lshr_b32 s52, s27, 24
+; SI-NEXT:    s_lshr_b32 s53, s27, 16
+; SI-NEXT:    s_lshr_b32 s54, s27, 8
+; SI-NEXT:    s_lshr_b32 s55, s25, 24
+; SI-NEXT:    s_lshr_b32 s64, s25, 16
+; SI-NEXT:    s_lshr_b32 s65, s25, 8
+; SI-NEXT:    s_lshr_b32 s66, s23, 24
+; SI-NEXT:    s_lshr_b32 s67, s23, 16
+; SI-NEXT:    s_lshr_b32 s68, s23, 8
+; SI-NEXT:    s_lshr_b32 s69, s21, 24
+; SI-NEXT:    s_lshr_b32 s70, s21, 16
+; SI-NEXT:    s_lshr_b32 s71, s21, 8
+; SI-NEXT:    s_lshr_b32 s80, s19, 24
+; SI-NEXT:    s_lshr_b32 s81, s19, 16
+; SI-NEXT:    s_lshr_b32 s82, s19, 8
+; SI-NEXT:    s_lshr_b32 s83, s17, 24
+; SI-NEXT:    s_lshr_b32 s84, s17, 16
+; SI-NEXT:    s_lshr_b32 s85, s17, 8
+; SI-NEXT:    s_lshr_b64 s[6:7], s[4:5], 24
+; SI-NEXT:    s_lshr_b64 s[8:9], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[4:5], 8
+; SI-NEXT:    s_lshr_b64 s[12:13], s[28:29], 24
+; SI-NEXT:    s_lshr_b64 s[14:15], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[28:29], 8
+; SI-NEXT:    s_lshr_b64 s[42:43], s[26:27], 24
+; SI-NEXT:    s_lshr_b64 s[44:45], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[26:27], 8
+; SI-NEXT:    s_lshr_b64 s[56:57], s[24:25], 24
+; SI-NEXT:    s_lshr_b64 s[58:59], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[24:25], 8
+; SI-NEXT:    s_lshr_b64 s[74:75], s[22:23], 24
+; SI-NEXT:    s_lshr_b64 s[78:79], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[88:89], s[22:23], 8
+; SI-NEXT:    s_lshr_b64 s[62:63], s[20:21], 24
+; SI-NEXT:    s_lshr_b64 s[72:73], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[76:77], s[20:21], 8
+; SI-NEXT:    s_lshr_b64 s[90:91], s[18:19], 24
+; SI-NEXT:    s_lshr_b64 s[92:93], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[94:95], s[18:19], 8
+; SI-NEXT:    s_lshr_b64 s[30:31], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[34:35], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[36:37], s[16:17], 8
 ; SI-NEXT:  .LBB69_3: ; %end
-; SI-NEXT:    s_and_b32 s4, s16, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 8, v22
-; SI-NEXT:    v_or_b32_e32 v22, s4, v22
-; SI-NEXT:    s_and_b32 s4, s17, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s63, 8
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s62, 0xff
-; SI-NEXT:    v_and_b32_e32 v24, 0xff, v24
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s61, 24
-; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v23, 24, v23
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
-; SI-NEXT:    v_or_b32_e32 v23, v23, v24
-; SI-NEXT:    v_and_b32_e32 v22, 0xffff, v22
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_or_b32_e32 v22, v22, v23
-; SI-NEXT:    v_mov_b32_e32 v23, s4
-; SI-NEXT:    s_and_b32 s4, s18, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 8, v21
-; SI-NEXT:    v_or_b32_e32 v21, s4, v21
-; SI-NEXT:    s_and_b32 s4, s19, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s60, 8
-; SI-NEXT:    v_and_b32_e32 v20, 0xff, v20
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s59, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 24, v19
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s58, 24
-; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v21
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
-; SI-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v22, vcc, 4, v0
-; SI-NEXT:    v_or_b32_e32 v19, v21, v19
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 8, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v23, v22, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v20, s4
-; SI-NEXT:    s_and_b32 s4, s20, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 8, v18
-; SI-NEXT:    v_or_b32_e32 v18, s4, v18
-; SI-NEXT:    s_and_b32 s4, s21, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s57, 8
-; SI-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s56, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 24, v16
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s47, 24
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v18
-; SI-NEXT:    v_or_b32_e32 v16, v16, v17
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v19, vcc, 12, v0
-; SI-NEXT:    v_or_b32_e32 v16, v18, v16
-; SI-NEXT:    v_add_i32_e32 v17, vcc, 16, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v20, v19, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v16, v17, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v17, s4
-; SI-NEXT:    s_and_b32 s4, s22, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 8, v14
-; SI-NEXT:    v_or_b32_e32 v14, s4, v14
-; SI-NEXT:    s_and_b32 s4, s23, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s46, 8
-; SI-NEXT:    v_and_b32_e32 v12, 0xff, v12
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s45, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v11
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s44, 24
-; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; SI-NEXT:    v_or_b32_e32 v11, v11, v12
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
+; SI-NEXT:    s_lshl_b32 s7, s36, 8
+; SI-NEXT:    s_and_b32 s9, s16, 0xff
+; SI-NEXT:    s_or_b32 s7, s9, s7
+; SI-NEXT:    s_and_b32 s9, s34, 0xff
+; SI-NEXT:    s_lshl_b32 s11, s30, 24
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_or_b32 s9, s11, s9
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    v_mov_b32_e32 v1, s7
+; SI-NEXT:    s_and_b32 s7, s17, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s85, 8
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    s_and_b32 s9, s84, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_lshl_b32 s11, s83, 24
+; SI-NEXT:    s_or_b32 s9, s11, s9
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    s_lshl_b32 s7, s94, 8
+; SI-NEXT:    s_and_b32 s9, s18, 0xff
+; SI-NEXT:    s_or_b32 s7, s9, s7
+; SI-NEXT:    s_and_b32 s9, s92, 0xff
+; SI-NEXT:    s_lshl_b32 s11, s90, 24
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_or_b32 s9, s11, s9
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    v_mov_b32_e32 v3, s7
+; SI-NEXT:    s_and_b32 s7, s19, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s82, 8
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    s_and_b32 s9, s81, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_lshl_b32 s11, s80, 24
+; SI-NEXT:    s_or_b32 s9, s11, s9
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v16, vcc, 20, v0
-; SI-NEXT:    v_or_b32_e32 v11, v14, v11
-; SI-NEXT:    v_add_i32_e32 v12, vcc, 24, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v17, v16, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v11, v12, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v12, s4
-; SI-NEXT:    s_and_b32 s4, s24, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 8, v10
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 28, v0
-; SI-NEXT:    v_or_b32_e32 v10, s4, v10
-; SI-NEXT:    s_and_b32 s4, s25, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s43, 8
-; SI-NEXT:    buffer_store_dword v12, v11, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v11, 0xff, v15
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s42, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    s_and_b32 s7, s20, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s76, 8
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    s_and_b32 s9, s72, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_lshl_b32 s11, s62, 24
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s9, s11, s9
+; SI-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 12, v0
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    s_and_b32 s7, s21, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s71, 8
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    s_and_b32 s9, s70, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_lshl_b32 s11, s69, 24
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s9, s11, s9
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 16, v0
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 24, v13
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s41, 24
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; SI-NEXT:    v_or_b32_e32 v11, v12, v11
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
-; SI-NEXT:    v_or_b32_e32 v10, v10, v11
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 32, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v10, v11, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v11, s4
-; SI-NEXT:    s_and_b32 s4, s26, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; SI-NEXT:    v_or_b32_e32 v9, s4, v9
-; SI-NEXT:    s_and_b32 s4, s27, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s40, 8
-; SI-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s15, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s14, s14, 24
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; SI-NEXT:    v_or_b32_e32 v7, v7, v8
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s14, s5
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    s_and_b32 s7, s22, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s88, 8
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    s_and_b32 s9, s78, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_lshl_b32 s11, s74, 24
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s9, s11, s9
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 20, v0
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    s_and_b32 s7, s23, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s68, 8
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    s_and_b32 s9, s67, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_lshl_b32 s11, s66, 24
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s9, s11, s9
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 24, v0
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    s_and_b32 s7, s24, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s60, 8
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    s_and_b32 s9, s58, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_lshl_b32 s11, s56, 24
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s9, s11, s9
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 28, v0
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    s_and_b32 s7, s25, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s65, 8
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    s_and_b32 s9, s64, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_lshl_b32 s11, s55, 24
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s9, s11, s9
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 32, v0
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 36, v0
-; SI-NEXT:    v_or_b32_e32 v7, v9, v7
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 40, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v11, v10, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v8, s4
-; SI-NEXT:    s_and_b32 s4, s28, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
-; SI-NEXT:    v_or_b32_e32 v6, s4, v6
-; SI-NEXT:    s_and_b32 s4, s29, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s13, 8
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s12, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s11, s11, 24
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; SI-NEXT:    v_or_b32_e32 v4, v4, v5
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    s_and_b32 s7, s26, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s46, 8
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    s_and_b32 s9, s44, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_lshl_b32 s11, s42, 24
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s9, s11, s9
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 36, v0
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    s_and_b32 s7, s27, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s54, 8
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    s_and_b32 s9, s53, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_lshl_b32 s11, s52, 24
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s9, s11, s9
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 40, v0
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    s_and_b32 s7, s28, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s40, 8
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    s_and_b32 s9, s14, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_lshl_b32 s11, s12, 24
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s9, s11, s9
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 44, v0
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    s_and_b32 s7, s29, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s51, 8
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    s_and_b32 s9, s50, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_lshl_b32 s11, s49, 24
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s9, s11, s9
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 48, v0
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    s_and_b32 s4, s4, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s10, 8
+; SI-NEXT:    s_or_b32 s4, s4, s7
+; SI-NEXT:    s_and_b32 s7, s8, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    s_lshl_b32 s6, s6, 24
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s11, s5
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 52, v0
+; SI-NEXT:    s_or_b32 s4, s4, s6
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 44, v0
-; SI-NEXT:    v_or_b32_e32 v4, v6, v4
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v8, v7, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v5, s4
-; SI-NEXT:    s_and_b32 s4, s7, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; SI-NEXT:    v_or_b32_e32 v3, s4, v3
-; SI-NEXT:    s_and_b32 s4, s6, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s10, 8
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT:    v_mov_b32_e32 v2, s4
+; SI-NEXT:    s_and_b32 s4, s5, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s48, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s9, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; SI-NEXT:    s_and_b32 s5, s39, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s6, s8, 24
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    s_lshl_b32 s6, s38, 24
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s5, s6, s5
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 52, v0
-; SI-NEXT:    v_or_b32_e32 v1, v3, v1
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 56, v0
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 56, v0
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 60, v0
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT:    v_readlane_b32 s85, v4, 29
+; SI-NEXT:    v_readlane_b32 s84, v4, 28
+; SI-NEXT:    v_readlane_b32 s83, v4, 27
+; SI-NEXT:    v_readlane_b32 s82, v4, 26
+; SI-NEXT:    v_readlane_b32 s81, v4, 25
+; SI-NEXT:    v_readlane_b32 s80, v4, 24
+; SI-NEXT:    v_readlane_b32 s71, v4, 23
+; SI-NEXT:    v_readlane_b32 s70, v4, 22
+; SI-NEXT:    v_readlane_b32 s69, v4, 21
+; SI-NEXT:    v_readlane_b32 s68, v4, 20
+; SI-NEXT:    v_readlane_b32 s67, v4, 19
+; SI-NEXT:    v_readlane_b32 s66, v4, 18
+; SI-NEXT:    v_readlane_b32 s65, v4, 17
+; SI-NEXT:    v_readlane_b32 s64, v4, 16
+; SI-NEXT:    v_readlane_b32 s55, v4, 15
+; SI-NEXT:    v_readlane_b32 s54, v4, 14
+; SI-NEXT:    v_readlane_b32 s53, v4, 13
+; SI-NEXT:    v_readlane_b32 s52, v4, 12
+; SI-NEXT:    v_readlane_b32 s51, v4, 11
+; SI-NEXT:    v_readlane_b32 s50, v4, 10
+; SI-NEXT:    v_readlane_b32 s49, v4, 9
+; SI-NEXT:    v_readlane_b32 s48, v4, 8
+; SI-NEXT:    v_readlane_b32 s39, v4, 7
+; SI-NEXT:    v_readlane_b32 s38, v4, 6
+; SI-NEXT:    v_readlane_b32 s37, v4, 5
+; SI-NEXT:    v_readlane_b32 s36, v4, 4
+; SI-NEXT:    v_readlane_b32 s35, v4, 3
+; SI-NEXT:    v_readlane_b32 s34, v4, 2
+; SI-NEXT:    v_readlane_b32 s31, v4, 1
+; SI-NEXT:    v_readlane_b32 s30, v4, 0
+; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB69_4:
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr24
-; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; implicit-def: $sgpr63
+; SI-NEXT:    ; implicit-def: $sgpr36
+; SI-NEXT:    ; implicit-def: $sgpr34
+; SI-NEXT:    ; implicit-def: $sgpr30
+; SI-NEXT:    ; implicit-def: $sgpr85
+; SI-NEXT:    ; implicit-def: $sgpr84
+; SI-NEXT:    ; implicit-def: $sgpr83
+; SI-NEXT:    ; implicit-def: $sgpr94
+; SI-NEXT:    ; implicit-def: $sgpr92
+; SI-NEXT:    ; implicit-def: $sgpr90
+; SI-NEXT:    ; implicit-def: $sgpr82
+; SI-NEXT:    ; implicit-def: $sgpr81
+; SI-NEXT:    ; implicit-def: $sgpr80
+; SI-NEXT:    ; implicit-def: $sgpr76
+; SI-NEXT:    ; implicit-def: $sgpr72
 ; SI-NEXT:    ; implicit-def: $sgpr62
-; SI-NEXT:    ; implicit-def: $sgpr61
-; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr20
-; SI-NEXT:    ; implicit-def: $vgpr19
+; SI-NEXT:    ; implicit-def: $sgpr71
+; SI-NEXT:    ; implicit-def: $sgpr70
+; SI-NEXT:    ; implicit-def: $sgpr69
+; SI-NEXT:    ; implicit-def: $sgpr68
+; SI-NEXT:    ; implicit-def: $sgpr67
+; SI-NEXT:    ; implicit-def: $sgpr66
+; SI-NEXT:    ; implicit-def: $sgpr65
+; SI-NEXT:    ; implicit-def: $sgpr64
+; SI-NEXT:    ; implicit-def: $sgpr55
+; SI-NEXT:    ; implicit-def: $sgpr54
+; SI-NEXT:    ; implicit-def: $sgpr53
+; SI-NEXT:    ; implicit-def: $sgpr52
+; SI-NEXT:    ; implicit-def: $sgpr51
+; SI-NEXT:    ; implicit-def: $sgpr50
+; SI-NEXT:    ; implicit-def: $sgpr49
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr39
+; SI-NEXT:    ; implicit-def: $sgpr38
+; SI-NEXT:    ; implicit-def: $sgpr88
+; SI-NEXT:    ; implicit-def: $sgpr78
+; SI-NEXT:    ; implicit-def: $sgpr74
 ; SI-NEXT:    ; implicit-def: $sgpr60
-; SI-NEXT:    ; implicit-def: $sgpr59
 ; SI-NEXT:    ; implicit-def: $sgpr58
-; SI-NEXT:    ; implicit-def: $vgpr18
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr16
-; SI-NEXT:    ; implicit-def: $sgpr57
 ; SI-NEXT:    ; implicit-def: $sgpr56
-; SI-NEXT:    ; implicit-def: $sgpr47
-; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; implicit-def: $vgpr11
 ; SI-NEXT:    ; implicit-def: $sgpr46
-; SI-NEXT:    ; implicit-def: $sgpr45
 ; SI-NEXT:    ; implicit-def: $sgpr44
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $sgpr43
 ; SI-NEXT:    ; implicit-def: $sgpr42
-; SI-NEXT:    ; implicit-def: $sgpr41
 ; SI-NEXT:    ; implicit-def: $sgpr40
-; SI-NEXT:    ; implicit-def: $sgpr15
 ; SI-NEXT:    ; implicit-def: $sgpr14
-; SI-NEXT:    ; implicit-def: $sgpr13
 ; SI-NEXT:    ; implicit-def: $sgpr12
-; SI-NEXT:    ; implicit-def: $sgpr11
 ; SI-NEXT:    ; implicit-def: $sgpr10
-; SI-NEXT:    ; implicit-def: $sgpr9
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $vgpr15
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr7
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $sgpr6
 ; SI-NEXT:    s_branch .LBB69_2
 ;
 ; VI-LABEL: bitcast_v8i64_to_v64i8_scalar:
@@ -44167,100 +44403,100 @@ define inreg <32 x i16> @bitcast_v8f64_to_v32i16_scalar(<8 x double> inreg %a, i
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; SI-NEXT:    v_mov_b32_e32 v33, v1
 ; SI-NEXT:    v_mov_b32_e32 v32, v0
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_mov_b32_e32 v1, s17
-; SI-NEXT:    v_mov_b32_e32 v4, s18
-; SI-NEXT:    v_mov_b32_e32 v5, s19
-; SI-NEXT:    v_mov_b32_e32 v8, s20
-; SI-NEXT:    v_mov_b32_e32 v9, s21
-; SI-NEXT:    v_mov_b32_e32 v12, s22
-; SI-NEXT:    v_mov_b32_e32 v13, s23
-; SI-NEXT:    v_mov_b32_e32 v16, s24
-; SI-NEXT:    v_mov_b32_e32 v17, s25
-; SI-NEXT:    v_mov_b32_e32 v20, s26
-; SI-NEXT:    v_mov_b32_e32 v21, s27
-; SI-NEXT:    v_mov_b32_e32 v24, s28
+; SI-NEXT:    v_mov_b32_e32 v34, s16
+; SI-NEXT:    v_mov_b32_e32 v35, s17
+; SI-NEXT:    v_mov_b32_e32 v36, s18
+; SI-NEXT:    v_mov_b32_e32 v37, s19
+; SI-NEXT:    v_mov_b32_e32 v38, s20
+; SI-NEXT:    v_mov_b32_e32 v39, s21
+; SI-NEXT:    v_mov_b32_e32 v48, s22
+; SI-NEXT:    v_mov_b32_e32 v49, s23
+; SI-NEXT:    v_mov_b32_e32 v50, s24
+; SI-NEXT:    v_mov_b32_e32 v51, s25
+; SI-NEXT:    v_mov_b32_e32 v52, s26
+; SI-NEXT:    v_mov_b32_e32 v53, s27
+; SI-NEXT:    v_mov_b32_e32 v54, s28
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_mov_b32_e32 v25, s29
+; SI-NEXT:    v_mov_b32_e32 v55, s29
 ; SI-NEXT:    s_cbranch_scc0 .LBB73_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_alignbit_b32 v29, v33, v32, 16
-; SI-NEXT:    v_alignbit_b32 v48, v25, v24, 16
-; SI-NEXT:    v_alignbit_b32 v39, v21, v20, 16
-; SI-NEXT:    v_alignbit_b32 v38, v17, v16, 16
-; SI-NEXT:    v_alignbit_b32 v37, v13, v12, 16
-; SI-NEXT:    v_alignbit_b32 v36, v9, v8, 16
-; SI-NEXT:    v_alignbit_b32 v35, v5, v4, 16
-; SI-NEXT:    v_alignbit_b32 v34, v1, v0, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v33
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v25
-; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v21
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v13
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v55
+; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v53
+; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v51
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v49
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v39
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v37
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v35
+; SI-NEXT:    v_lshr_b64 v[29:30], v[32:33], 16
+; SI-NEXT:    v_lshr_b64 v[25:26], v[54:55], 16
+; SI-NEXT:    v_lshr_b64 v[21:22], v[52:53], 16
+; SI-NEXT:    v_lshr_b64 v[17:18], v[50:51], 16
+; SI-NEXT:    v_lshr_b64 v[13:14], v[48:49], 16
+; SI-NEXT:    v_lshr_b64 v[9:10], v[38:39], 16
+; SI-NEXT:    v_lshr_b64 v[5:6], v[36:37], 16
+; SI-NEXT:    v_lshr_b64 v[1:2], v[34:35], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB73_3
 ; SI-NEXT:  .LBB73_2: ; %cmp.true
-; SI-NEXT:    v_add_f64 v[0:1], v[0:1], 1.0
-; SI-NEXT:    v_add_f64 v[4:5], v[4:5], 1.0
-; SI-NEXT:    v_add_f64 v[8:9], v[8:9], 1.0
-; SI-NEXT:    v_add_f64 v[12:13], v[12:13], 1.0
-; SI-NEXT:    v_add_f64 v[16:17], v[16:17], 1.0
-; SI-NEXT:    v_add_f64 v[20:21], v[20:21], 1.0
 ; SI-NEXT:    v_add_f64 v[32:33], v[32:33], 1.0
-; SI-NEXT:    v_add_f64 v[24:25], v[24:25], 1.0
-; SI-NEXT:    v_alignbit_b32 v29, v33, v32, 16
-; SI-NEXT:    v_alignbit_b32 v48, v25, v24, 16
-; SI-NEXT:    v_alignbit_b32 v39, v21, v20, 16
-; SI-NEXT:    v_alignbit_b32 v38, v17, v16, 16
-; SI-NEXT:    v_alignbit_b32 v37, v13, v12, 16
-; SI-NEXT:    v_alignbit_b32 v36, v9, v8, 16
-; SI-NEXT:    v_alignbit_b32 v35, v5, v4, 16
-; SI-NEXT:    v_alignbit_b32 v34, v1, v0, 16
+; SI-NEXT:    v_add_f64 v[54:55], v[54:55], 1.0
+; SI-NEXT:    v_add_f64 v[52:53], v[52:53], 1.0
+; SI-NEXT:    v_add_f64 v[50:51], v[50:51], 1.0
+; SI-NEXT:    v_add_f64 v[48:49], v[48:49], 1.0
+; SI-NEXT:    v_add_f64 v[38:39], v[38:39], 1.0
+; SI-NEXT:    v_add_f64 v[36:37], v[36:37], 1.0
+; SI-NEXT:    v_add_f64 v[34:35], v[34:35], 1.0
+; SI-NEXT:    v_lshr_b64 v[29:30], v[32:33], 16
+; SI-NEXT:    v_lshr_b64 v[25:26], v[54:55], 16
+; SI-NEXT:    v_lshr_b64 v[21:22], v[52:53], 16
+; SI-NEXT:    v_lshr_b64 v[17:18], v[50:51], 16
+; SI-NEXT:    v_lshr_b64 v[13:14], v[48:49], 16
+; SI-NEXT:    v_lshr_b64 v[9:10], v[38:39], 16
+; SI-NEXT:    v_lshr_b64 v[5:6], v[36:37], 16
+; SI-NEXT:    v_lshr_b64 v[1:2], v[34:35], 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v33
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v25
-; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v21
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v13
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v55
+; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v53
+; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v51
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v49
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v39
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v37
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v35
 ; SI-NEXT:  .LBB73_3: ; %end
-; SI-NEXT:    v_mov_b32_e32 v2, v1
-; SI-NEXT:    v_mov_b32_e32 v6, v5
-; SI-NEXT:    v_mov_b32_e32 v10, v9
-; SI-NEXT:    v_mov_b32_e32 v14, v13
-; SI-NEXT:    v_mov_b32_e32 v18, v17
-; SI-NEXT:    v_mov_b32_e32 v22, v21
-; SI-NEXT:    v_mov_b32_e32 v26, v25
+; SI-NEXT:    v_mov_b32_e32 v0, v34
+; SI-NEXT:    v_mov_b32_e32 v2, v35
+; SI-NEXT:    v_mov_b32_e32 v4, v36
+; SI-NEXT:    v_mov_b32_e32 v6, v37
+; SI-NEXT:    v_mov_b32_e32 v8, v38
+; SI-NEXT:    v_mov_b32_e32 v10, v39
+; SI-NEXT:    v_mov_b32_e32 v12, v48
+; SI-NEXT:    v_mov_b32_e32 v14, v49
+; SI-NEXT:    v_mov_b32_e32 v16, v50
+; SI-NEXT:    v_mov_b32_e32 v18, v51
+; SI-NEXT:    v_mov_b32_e32 v20, v52
+; SI-NEXT:    v_mov_b32_e32 v22, v53
+; SI-NEXT:    v_mov_b32_e32 v24, v54
+; SI-NEXT:    v_mov_b32_e32 v26, v55
 ; SI-NEXT:    v_mov_b32_e32 v28, v32
 ; SI-NEXT:    v_mov_b32_e32 v30, v33
-; SI-NEXT:    v_mov_b32_e32 v1, v34
-; SI-NEXT:    v_mov_b32_e32 v5, v35
-; SI-NEXT:    v_mov_b32_e32 v9, v36
-; SI-NEXT:    v_mov_b32_e32 v13, v37
-; SI-NEXT:    v_mov_b32_e32 v17, v38
-; SI-NEXT:    v_mov_b32_e32 v21, v39
-; SI-NEXT:    v_mov_b32_e32 v25, v48
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB73_4:
-; SI-NEXT:    ; implicit-def: $vgpr34
+; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr35
+; SI-NEXT:    ; implicit-def: $vgpr5
 ; SI-NEXT:    ; implicit-def: $vgpr7
-; SI-NEXT:    ; implicit-def: $vgpr36
+; SI-NEXT:    ; implicit-def: $vgpr9
 ; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; implicit-def: $vgpr37
+; SI-NEXT:    ; implicit-def: $vgpr13
 ; SI-NEXT:    ; implicit-def: $vgpr15
-; SI-NEXT:    ; implicit-def: $vgpr38
+; SI-NEXT:    ; implicit-def: $vgpr17
 ; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    ; implicit-def: $vgpr39
+; SI-NEXT:    ; implicit-def: $vgpr21
 ; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; implicit-def: $vgpr48
 ; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $vgpr29
 ; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr25
+; SI-NEXT:    ; implicit-def: $vgpr29
 ; SI-NEXT:    s_branch .LBB73_2
 ;
 ; VI-LABEL: bitcast_v8f64_to_v32i16_scalar:
@@ -51191,376 +51427,435 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
 ; SI-LABEL: bitcast_v8f64_to_v64i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_writelane_b32 v40, s30, 0
+; SI-NEXT:    v_writelane_b32 v40, s31, 1
+; SI-NEXT:    v_writelane_b32 v40, s34, 2
+; SI-NEXT:    v_writelane_b32 v40, s35, 3
+; SI-NEXT:    v_writelane_b32 v40, s36, 4
+; SI-NEXT:    v_writelane_b32 v40, s37, 5
+; SI-NEXT:    v_writelane_b32 v40, s38, 6
+; SI-NEXT:    v_writelane_b32 v40, s39, 7
+; SI-NEXT:    v_writelane_b32 v40, s48, 8
+; SI-NEXT:    v_writelane_b32 v40, s49, 9
+; SI-NEXT:    v_writelane_b32 v40, s50, 10
+; SI-NEXT:    v_writelane_b32 v40, s51, 11
+; SI-NEXT:    v_writelane_b32 v40, s52, 12
+; SI-NEXT:    v_writelane_b32 v40, s53, 13
+; SI-NEXT:    v_writelane_b32 v40, s54, 14
+; SI-NEXT:    v_writelane_b32 v40, s55, 15
+; SI-NEXT:    v_writelane_b32 v40, s64, 16
+; SI-NEXT:    v_writelane_b32 v40, s65, 17
+; SI-NEXT:    v_writelane_b32 v40, s66, 18
+; SI-NEXT:    v_writelane_b32 v40, s67, 19
+; SI-NEXT:    v_writelane_b32 v40, s68, 20
+; SI-NEXT:    v_writelane_b32 v40, s69, 21
+; SI-NEXT:    v_writelane_b32 v40, s70, 22
+; SI-NEXT:    v_writelane_b32 v40, s71, 23
+; SI-NEXT:    v_writelane_b32 v40, s80, 24
+; SI-NEXT:    v_writelane_b32 v40, s81, 25
+; SI-NEXT:    v_writelane_b32 v40, s82, 26
+; SI-NEXT:    v_writelane_b32 v40, s83, 27
+; SI-NEXT:    v_writelane_b32 v40, s84, 28
+; SI-NEXT:    v_writelane_b32 v40, s85, 29
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; SI-NEXT:    v_writelane_b32 v40, s86, 30
 ; SI-NEXT:    v_readfirstlane_b32 s4, v1
 ; SI-NEXT:    s_and_b64 s[6:7], vcc, exec
 ; SI-NEXT:    v_readfirstlane_b32 s5, v2
+; SI-NEXT:    v_writelane_b32 v40, s87, 31
 ; SI-NEXT:    s_cbranch_scc0 .LBB85_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v1, s4
-; SI-NEXT:    v_alignbit_b32 v2, s5, v1, 24
-; SI-NEXT:    v_alignbit_b32 v17, s5, v1, 16
-; SI-NEXT:    v_alignbit_b32 v18, s5, v1, 8
-; SI-NEXT:    v_mov_b32_e32 v1, s28
-; SI-NEXT:    v_alignbit_b32 v20, s29, v1, 24
-; SI-NEXT:    v_alignbit_b32 v4, s29, v1, 16
-; SI-NEXT:    v_alignbit_b32 v19, s29, v1, 8
-; SI-NEXT:    v_mov_b32_e32 v1, s26
-; SI-NEXT:    v_alignbit_b32 v6, s27, v1, 24
-; SI-NEXT:    v_alignbit_b32 v21, s27, v1, 16
-; SI-NEXT:    v_alignbit_b32 v22, s27, v1, 8
-; SI-NEXT:    v_mov_b32_e32 v1, s24
-; SI-NEXT:    v_alignbit_b32 v8, s25, v1, 24
-; SI-NEXT:    v_alignbit_b32 v23, s25, v1, 16
-; SI-NEXT:    v_alignbit_b32 v24, s25, v1, 8
-; SI-NEXT:    v_mov_b32_e32 v1, s22
-; SI-NEXT:    v_alignbit_b32 v10, s23, v1, 24
-; SI-NEXT:    v_alignbit_b32 v25, s23, v1, 16
-; SI-NEXT:    v_alignbit_b32 v26, s23, v1, 8
-; SI-NEXT:    v_mov_b32_e32 v1, s20
-; SI-NEXT:    v_alignbit_b32 v12, s21, v1, 24
-; SI-NEXT:    v_alignbit_b32 v14, s21, v1, 16
-; SI-NEXT:    v_alignbit_b32 v16, s21, v1, 8
-; SI-NEXT:    v_mov_b32_e32 v1, s18
-; SI-NEXT:    v_alignbit_b32 v27, s19, v1, 24
-; SI-NEXT:    v_alignbit_b32 v28, s19, v1, 16
-; SI-NEXT:    v_alignbit_b32 v29, s19, v1, 8
-; SI-NEXT:    v_mov_b32_e32 v1, s16
-; SI-NEXT:    v_alignbit_b32 v30, s17, v1, 24
-; SI-NEXT:    v_alignbit_b32 v31, s17, v1, 16
-; SI-NEXT:    v_alignbit_b32 v32, s17, v1, 8
-; SI-NEXT:    s_lshr_b32 s8, s5, 24
-; SI-NEXT:    s_lshr_b32 s9, s5, 16
-; SI-NEXT:    s_lshr_b32 s10, s5, 8
-; SI-NEXT:    s_lshr_b32 s11, s29, 24
-; SI-NEXT:    s_lshr_b32 s12, s29, 16
-; SI-NEXT:    s_lshr_b32 s13, s29, 8
-; SI-NEXT:    s_lshr_b32 s14, s27, 24
-; SI-NEXT:    s_lshr_b32 s15, s27, 16
-; SI-NEXT:    s_lshr_b32 s40, s27, 8
-; SI-NEXT:    s_lshr_b32 s41, s25, 24
-; SI-NEXT:    s_lshr_b32 s42, s25, 16
-; SI-NEXT:    s_lshr_b32 s43, s25, 8
-; SI-NEXT:    s_lshr_b32 s44, s23, 24
-; SI-NEXT:    s_lshr_b32 s45, s23, 16
-; SI-NEXT:    s_lshr_b32 s46, s23, 8
-; SI-NEXT:    s_lshr_b32 s47, s21, 24
-; SI-NEXT:    s_lshr_b32 s56, s21, 16
-; SI-NEXT:    s_lshr_b32 s57, s21, 8
-; SI-NEXT:    s_lshr_b32 s58, s19, 24
-; SI-NEXT:    s_lshr_b32 s59, s19, 16
-; SI-NEXT:    s_lshr_b32 s60, s19, 8
-; SI-NEXT:    s_lshr_b32 s61, s17, 24
-; SI-NEXT:    s_lshr_b32 s62, s17, 16
-; SI-NEXT:    s_lshr_b32 s63, s17, 8
+; SI-NEXT:    s_lshr_b32 s48, s5, 24
+; SI-NEXT:    s_lshr_b32 s49, s5, 16
+; SI-NEXT:    s_lshr_b32 s50, s5, 8
+; SI-NEXT:    s_lshr_b32 s51, s29, 24
+; SI-NEXT:    s_lshr_b32 s52, s29, 16
+; SI-NEXT:    s_lshr_b32 s53, s29, 8
+; SI-NEXT:    s_lshr_b32 s54, s27, 24
+; SI-NEXT:    s_lshr_b32 s55, s27, 16
+; SI-NEXT:    s_lshr_b32 s64, s27, 8
+; SI-NEXT:    s_lshr_b32 s65, s25, 24
+; SI-NEXT:    s_lshr_b32 s66, s25, 16
+; SI-NEXT:    s_lshr_b32 s67, s25, 8
+; SI-NEXT:    s_lshr_b32 s68, s23, 24
+; SI-NEXT:    s_lshr_b32 s69, s23, 16
+; SI-NEXT:    s_lshr_b32 s70, s23, 8
+; SI-NEXT:    s_lshr_b32 s71, s21, 24
+; SI-NEXT:    s_lshr_b32 s80, s21, 16
+; SI-NEXT:    s_lshr_b32 s81, s21, 8
+; SI-NEXT:    s_lshr_b32 s82, s19, 24
+; SI-NEXT:    s_lshr_b32 s83, s19, 16
+; SI-NEXT:    s_lshr_b32 s84, s19, 8
+; SI-NEXT:    s_lshr_b32 s85, s17, 24
+; SI-NEXT:    s_lshr_b32 s86, s17, 16
+; SI-NEXT:    s_lshr_b32 s87, s17, 8
+; SI-NEXT:    s_lshr_b64 s[6:7], s[4:5], 24
+; SI-NEXT:    s_lshr_b64 s[10:11], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[4:5], 8
+; SI-NEXT:    s_lshr_b64 s[46:47], s[28:29], 24
+; SI-NEXT:    s_lshr_b64 s[60:61], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[72:73], s[28:29], 8
+; SI-NEXT:    s_lshr_b64 s[76:77], s[26:27], 24
+; SI-NEXT:    s_lshr_b64 s[78:79], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[88:89], s[26:27], 8
+; SI-NEXT:    s_lshr_b64 s[92:93], s[24:25], 24
+; SI-NEXT:    s_lshr_b64 s[94:95], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[30:31], s[24:25], 8
+; SI-NEXT:    s_lshr_b64 s[34:35], s[22:23], 24
+; SI-NEXT:    s_lshr_b64 s[36:37], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[38:39], s[22:23], 8
+; SI-NEXT:    s_lshr_b64 s[8:9], s[20:21], 24
+; SI-NEXT:    s_lshr_b64 s[12:13], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[20:21], 8
+; SI-NEXT:    s_lshr_b64 s[42:43], s[18:19], 24
+; SI-NEXT:    s_lshr_b64 s[44:45], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[18:19], 8
+; SI-NEXT:    s_lshr_b64 s[58:59], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[62:63], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[74:75], s[16:17], 8
 ; SI-NEXT:    s_cbranch_execnz .LBB85_4
 ; SI-NEXT:  .LBB85_2: ; %cmp.true
-; SI-NEXT:    v_add_f64 v[15:16], s[16:17], 1.0
-; SI-NEXT:    v_add_f64 v[13:14], s[18:19], 1.0
-; SI-NEXT:    v_add_f64 v[11:12], s[20:21], 1.0
-; SI-NEXT:    v_add_f64 v[9:10], s[22:23], 1.0
-; SI-NEXT:    v_add_f64 v[7:8], s[24:25], 1.0
+; SI-NEXT:    v_add_f64 v[28:29], s[18:19], 1.0
 ; SI-NEXT:    v_add_f64 v[5:6], s[26:27], 1.0
-; SI-NEXT:    v_add_f64 v[1:2], s[4:5], 1.0
+; SI-NEXT:    v_add_f64 v[13:14], s[22:23], 1.0
 ; SI-NEXT:    v_add_f64 v[3:4], s[28:29], 1.0
-; SI-NEXT:    v_readfirstlane_b32 s5, v2
-; SI-NEXT:    v_readfirstlane_b32 s29, v4
-; SI-NEXT:    v_readfirstlane_b32 s27, v6
+; SI-NEXT:    v_lshr_b64 v[48:49], v[28:29], 24
+; SI-NEXT:    v_add_f64 v[1:2], s[4:5], 1.0
+; SI-NEXT:    v_add_f64 v[7:8], s[24:25], 1.0
+; SI-NEXT:    v_add_f64 v[20:21], s[20:21], 1.0
+; SI-NEXT:    v_add_f64 v[32:33], s[16:17], 1.0
+; SI-NEXT:    v_lshr_b64 v[22:23], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[34:35], v[13:14], 16
+; SI-NEXT:    v_lshr_b64 v[49:50], v[28:29], 16
+; SI-NEXT:    v_lshr_b64 v[15:16], v[3:4], 24
+; SI-NEXT:    v_lshr_b64 v[23:24], v[5:6], 8
+; SI-NEXT:    v_lshr_b64 v[35:36], v[13:14], 8
+; SI-NEXT:    v_lshr_b64 v[50:51], v[28:29], 8
+; SI-NEXT:    v_lshr_b64 v[9:10], v[1:2], 24
+; SI-NEXT:    v_lshr_b64 v[16:17], v[3:4], 16
+; SI-NEXT:    v_lshr_b64 v[24:25], v[7:8], 24
+; SI-NEXT:    v_lshr_b64 v[36:37], v[20:21], 24
+; SI-NEXT:    v_lshr_b64 v[51:52], v[32:33], 24
+; SI-NEXT:    v_lshr_b64 v[10:11], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[17:18], v[3:4], 8
+; SI-NEXT:    v_lshr_b64 v[25:26], v[7:8], 16
+; SI-NEXT:    v_lshr_b64 v[37:38], v[20:21], 16
+; SI-NEXT:    v_lshr_b64 v[52:53], v[32:33], 16
+; SI-NEXT:    v_readfirstlane_b32 s17, v33
+; SI-NEXT:    v_readfirstlane_b32 s19, v29
+; SI-NEXT:    v_readfirstlane_b32 s21, v21
+; SI-NEXT:    v_readfirstlane_b32 s23, v14
 ; SI-NEXT:    v_readfirstlane_b32 s25, v8
-; SI-NEXT:    v_readfirstlane_b32 s23, v10
-; SI-NEXT:    v_readfirstlane_b32 s21, v12
-; SI-NEXT:    v_readfirstlane_b32 s19, v14
-; SI-NEXT:    v_readfirstlane_b32 s17, v16
-; SI-NEXT:    v_alignbit_b32 v2, s5, v1, 24
-; SI-NEXT:    v_alignbit_b32 v17, s5, v1, 16
-; SI-NEXT:    v_alignbit_b32 v18, s5, v1, 8
-; SI-NEXT:    v_alignbit_b32 v20, s29, v3, 24
-; SI-NEXT:    v_alignbit_b32 v4, s29, v3, 16
-; SI-NEXT:    v_alignbit_b32 v19, s29, v3, 8
-; SI-NEXT:    v_alignbit_b32 v6, s27, v5, 24
-; SI-NEXT:    v_alignbit_b32 v21, s27, v5, 16
-; SI-NEXT:    v_alignbit_b32 v22, s27, v5, 8
-; SI-NEXT:    v_alignbit_b32 v8, s25, v7, 24
-; SI-NEXT:    v_alignbit_b32 v23, s25, v7, 16
-; SI-NEXT:    v_alignbit_b32 v24, s25, v7, 8
-; SI-NEXT:    v_alignbit_b32 v10, s23, v9, 24
-; SI-NEXT:    v_alignbit_b32 v25, s23, v9, 16
-; SI-NEXT:    v_alignbit_b32 v26, s23, v9, 8
-; SI-NEXT:    v_alignbit_b32 v12, s21, v11, 24
-; SI-NEXT:    s_lshr_b32 s8, s5, 24
-; SI-NEXT:    s_lshr_b32 s9, s5, 16
-; SI-NEXT:    s_lshr_b32 s10, s5, 8
-; SI-NEXT:    s_lshr_b32 s11, s29, 24
-; SI-NEXT:    s_lshr_b32 s12, s29, 16
-; SI-NEXT:    s_lshr_b32 s13, s29, 8
-; SI-NEXT:    s_lshr_b32 s14, s27, 24
-; SI-NEXT:    s_lshr_b32 s15, s27, 16
-; SI-NEXT:    s_lshr_b32 s40, s27, 8
-; SI-NEXT:    s_lshr_b32 s41, s25, 24
-; SI-NEXT:    s_lshr_b32 s42, s25, 16
-; SI-NEXT:    s_lshr_b32 s43, s25, 8
-; SI-NEXT:    s_lshr_b32 s44, s23, 24
-; SI-NEXT:    s_lshr_b32 s45, s23, 16
-; SI-NEXT:    s_lshr_b32 s46, s23, 8
-; SI-NEXT:    s_lshr_b32 s47, s21, 24
-; SI-NEXT:    s_lshr_b32 s56, s21, 16
-; SI-NEXT:    s_lshr_b32 s57, s21, 8
-; SI-NEXT:    s_lshr_b32 s58, s19, 24
-; SI-NEXT:    s_lshr_b32 s59, s19, 16
-; SI-NEXT:    s_lshr_b32 s60, s19, 8
-; SI-NEXT:    s_lshr_b32 s61, s17, 24
-; SI-NEXT:    s_lshr_b32 s62, s17, 16
-; SI-NEXT:    s_lshr_b32 s63, s17, 8
-; SI-NEXT:    v_alignbit_b32 v14, s21, v11, 16
-; SI-NEXT:    v_alignbit_b32 v16, s21, v11, 8
-; SI-NEXT:    v_alignbit_b32 v27, s19, v13, 24
-; SI-NEXT:    v_alignbit_b32 v28, s19, v13, 16
-; SI-NEXT:    v_alignbit_b32 v29, s19, v13, 8
-; SI-NEXT:    v_alignbit_b32 v30, s17, v15, 24
-; SI-NEXT:    v_alignbit_b32 v31, s17, v15, 16
-; SI-NEXT:    v_alignbit_b32 v32, s17, v15, 8
+; SI-NEXT:    v_readfirstlane_b32 s27, v6
+; SI-NEXT:    v_readfirstlane_b32 s29, v4
+; SI-NEXT:    v_readfirstlane_b32 s5, v2
+; SI-NEXT:    v_lshr_b64 v[11:12], v[1:2], 8
+; SI-NEXT:    v_lshr_b64 v[18:19], v[5:6], 24
+; SI-NEXT:    v_lshr_b64 v[26:27], v[7:8], 8
+; SI-NEXT:    v_lshr_b64 v[30:31], v[13:14], 24
+; SI-NEXT:    v_lshr_b64 v[38:39], v[20:21], 8
+; SI-NEXT:    v_lshr_b64 v[53:54], v[32:33], 8
+; SI-NEXT:    s_lshr_b32 s48, s5, 24
+; SI-NEXT:    s_lshr_b32 s49, s5, 16
+; SI-NEXT:    s_lshr_b32 s50, s5, 8
+; SI-NEXT:    s_lshr_b32 s51, s29, 24
+; SI-NEXT:    s_lshr_b32 s52, s29, 16
+; SI-NEXT:    s_lshr_b32 s53, s29, 8
+; SI-NEXT:    s_lshr_b32 s54, s27, 24
+; SI-NEXT:    s_lshr_b32 s55, s27, 16
+; SI-NEXT:    s_lshr_b32 s64, s27, 8
+; SI-NEXT:    s_lshr_b32 s65, s25, 24
+; SI-NEXT:    s_lshr_b32 s66, s25, 16
+; SI-NEXT:    s_lshr_b32 s67, s25, 8
+; SI-NEXT:    s_lshr_b32 s68, s23, 24
+; SI-NEXT:    s_lshr_b32 s69, s23, 16
+; SI-NEXT:    s_lshr_b32 s70, s23, 8
+; SI-NEXT:    s_lshr_b32 s71, s21, 24
+; SI-NEXT:    s_lshr_b32 s80, s21, 16
+; SI-NEXT:    s_lshr_b32 s81, s21, 8
+; SI-NEXT:    s_lshr_b32 s82, s19, 24
+; SI-NEXT:    s_lshr_b32 s83, s19, 16
+; SI-NEXT:    s_lshr_b32 s84, s19, 8
+; SI-NEXT:    s_lshr_b32 s85, s17, 24
+; SI-NEXT:    s_lshr_b32 s86, s17, 16
+; SI-NEXT:    s_lshr_b32 s87, s17, 8
 ; SI-NEXT:    s_branch .LBB85_5
 ; SI-NEXT:  .LBB85_3:
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr30
-; SI-NEXT:    ; implicit-def: $sgpr63
+; SI-NEXT:    ; implicit-def: $sgpr74
 ; SI-NEXT:    ; implicit-def: $sgpr62
-; SI-NEXT:    ; implicit-def: $sgpr61
-; SI-NEXT:    ; implicit-def: $vgpr29
-; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $sgpr60
-; SI-NEXT:    ; implicit-def: $sgpr59
 ; SI-NEXT:    ; implicit-def: $sgpr58
-; SI-NEXT:    ; implicit-def: $vgpr16
-; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; implicit-def: $sgpr57
+; SI-NEXT:    ; implicit-def: $sgpr87
+; SI-NEXT:    ; implicit-def: $sgpr86
+; SI-NEXT:    ; implicit-def: $sgpr85
 ; SI-NEXT:    ; implicit-def: $sgpr56
-; SI-NEXT:    ; implicit-def: $sgpr47
-; SI-NEXT:    ; implicit-def: $vgpr26
-; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $sgpr46
-; SI-NEXT:    ; implicit-def: $sgpr45
 ; SI-NEXT:    ; implicit-def: $sgpr44
-; SI-NEXT:    ; implicit-def: $vgpr24
-; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $sgpr43
 ; SI-NEXT:    ; implicit-def: $sgpr42
-; SI-NEXT:    ; implicit-def: $sgpr41
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr6
+; SI-NEXT:    ; implicit-def: $sgpr84
+; SI-NEXT:    ; implicit-def: $sgpr83
+; SI-NEXT:    ; implicit-def: $sgpr82
 ; SI-NEXT:    ; implicit-def: $sgpr40
-; SI-NEXT:    ; implicit-def: $sgpr15
-; SI-NEXT:    ; implicit-def: $sgpr14
-; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $sgpr13
 ; SI-NEXT:    ; implicit-def: $sgpr12
-; SI-NEXT:    ; implicit-def: $sgpr11
-; SI-NEXT:    ; implicit-def: $sgpr10
-; SI-NEXT:    ; implicit-def: $sgpr9
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $vgpr20
-; SI-NEXT:    ; implicit-def: $vgpr18
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; implicit-def: $sgpr81
+; SI-NEXT:    ; implicit-def: $sgpr80
+; SI-NEXT:    ; implicit-def: $sgpr71
+; SI-NEXT:    ; implicit-def: $sgpr70
+; SI-NEXT:    ; implicit-def: $sgpr69
+; SI-NEXT:    ; implicit-def: $sgpr68
+; SI-NEXT:    ; implicit-def: $sgpr67
+; SI-NEXT:    ; implicit-def: $sgpr66
+; SI-NEXT:    ; implicit-def: $sgpr65
+; SI-NEXT:    ; implicit-def: $sgpr64
+; SI-NEXT:    ; implicit-def: $sgpr55
+; SI-NEXT:    ; implicit-def: $sgpr54
+; SI-NEXT:    ; implicit-def: $sgpr53
+; SI-NEXT:    ; implicit-def: $sgpr52
+; SI-NEXT:    ; implicit-def: $sgpr51
+; SI-NEXT:    ; implicit-def: $sgpr50
+; SI-NEXT:    ; implicit-def: $sgpr49
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr38
+; SI-NEXT:    ; implicit-def: $sgpr36
+; SI-NEXT:    ; implicit-def: $sgpr34
+; SI-NEXT:    ; implicit-def: $sgpr30
+; SI-NEXT:    ; implicit-def: $sgpr94
+; SI-NEXT:    ; implicit-def: $sgpr92
+; SI-NEXT:    ; implicit-def: $sgpr88
+; SI-NEXT:    ; implicit-def: $sgpr78
+; SI-NEXT:    ; implicit-def: $sgpr76
+; SI-NEXT:    ; implicit-def: $sgpr72
+; SI-NEXT:    ; implicit-def: $sgpr60
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr6
 ; SI-NEXT:    s_branch .LBB85_2
 ; SI-NEXT:  .LBB85_4:
-; SI-NEXT:    v_mov_b32_e32 v1, s4
-; SI-NEXT:    v_mov_b32_e32 v3, s28
-; SI-NEXT:    v_mov_b32_e32 v5, s26
+; SI-NEXT:    v_mov_b32_e32 v32, s16
+; SI-NEXT:    v_mov_b32_e32 v28, s18
+; SI-NEXT:    v_mov_b32_e32 v20, s20
+; SI-NEXT:    v_mov_b32_e32 v13, s22
 ; SI-NEXT:    v_mov_b32_e32 v7, s24
-; SI-NEXT:    v_mov_b32_e32 v9, s22
-; SI-NEXT:    v_mov_b32_e32 v11, s20
-; SI-NEXT:    v_mov_b32_e32 v13, s18
-; SI-NEXT:    v_mov_b32_e32 v15, s16
+; SI-NEXT:    v_mov_b32_e32 v5, s26
+; SI-NEXT:    v_mov_b32_e32 v3, s28
+; SI-NEXT:    v_mov_b32_e32 v1, s4
+; SI-NEXT:    v_mov_b32_e32 v53, s74
+; SI-NEXT:    v_mov_b32_e32 v52, s62
+; SI-NEXT:    v_mov_b32_e32 v51, s58
+; SI-NEXT:    v_mov_b32_e32 v50, s56
+; SI-NEXT:    v_mov_b32_e32 v49, s44
+; SI-NEXT:    v_mov_b32_e32 v48, s42
+; SI-NEXT:    v_mov_b32_e32 v38, s40
+; SI-NEXT:    v_mov_b32_e32 v37, s12
+; SI-NEXT:    v_mov_b32_e32 v36, s8
+; SI-NEXT:    v_mov_b32_e32 v35, s38
+; SI-NEXT:    v_mov_b32_e32 v34, s36
+; SI-NEXT:    v_mov_b32_e32 v30, s34
+; SI-NEXT:    v_mov_b32_e32 v26, s30
+; SI-NEXT:    v_mov_b32_e32 v25, s94
+; SI-NEXT:    v_mov_b32_e32 v24, s92
+; SI-NEXT:    v_mov_b32_e32 v23, s88
+; SI-NEXT:    v_mov_b32_e32 v22, s78
+; SI-NEXT:    v_mov_b32_e32 v18, s76
+; SI-NEXT:    v_mov_b32_e32 v17, s72
+; SI-NEXT:    v_mov_b32_e32 v16, s60
+; SI-NEXT:    v_mov_b32_e32 v15, s46
+; SI-NEXT:    v_mov_b32_e32 v11, s14
+; SI-NEXT:    v_mov_b32_e32 v10, s10
+; SI-NEXT:    v_mov_b32_e32 v9, s6
 ; SI-NEXT:  .LBB85_5: ; %end
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v32
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v53
 ; SI-NEXT:    s_and_b32 s4, s17, 0xff
-; SI-NEXT:    s_lshl_b32 s6, s63, 8
-; SI-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 8, v32
-; SI-NEXT:    v_and_b32_e32 v31, 0xff, v31
+; SI-NEXT:    s_lshl_b32 s6, s87, 8
+; SI-NEXT:    v_or_b32_e32 v2, v2, v4
+; SI-NEXT:    v_and_b32_e32 v4, 0xff, v52
 ; SI-NEXT:    s_or_b32 s4, s4, s6
-; SI-NEXT:    s_and_b32 s6, s62, 0xff
-; SI-NEXT:    v_or_b32_e32 v15, v15, v32
-; SI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
-; SI-NEXT:    v_lshlrev_b32_e32 v30, 24, v30
+; SI-NEXT:    s_and_b32 s6, s86, 0xff
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v51
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; SI-NEXT:    s_lshl_b32 s6, s6, 16
-; SI-NEXT:    s_lshl_b32 s7, s61, 24
-; SI-NEXT:    v_or_b32_e32 v30, v30, v31
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; SI-NEXT:    s_lshl_b32 s7, s85, 24
+; SI-NEXT:    v_or_b32_e32 v4, v6, v4
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s6, s7, s6
-; SI-NEXT:    v_or_b32_e32 v15, v15, v30
+; SI-NEXT:    v_or_b32_e32 v2, v2, v4
 ; SI-NEXT:    s_or_b32 s4, s4, s6
-; SI-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v15, vcc, 4, v0
-; SI-NEXT:    v_mov_b32_e32 v30, s4
-; SI-NEXT:    buffer_store_dword v30, v15, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 8, v29
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
+; SI-NEXT:    v_mov_b32_e32 v4, s4
+; SI-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v28
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v50
 ; SI-NEXT:    s_and_b32 s4, s19, 0xff
-; SI-NEXT:    s_lshl_b32 s6, s60, 8
-; SI-NEXT:    v_or_b32_e32 v13, v13, v15
-; SI-NEXT:    v_and_b32_e32 v15, 0xff, v28
+; SI-NEXT:    s_lshl_b32 s6, s84, 8
+; SI-NEXT:    v_or_b32_e32 v2, v2, v4
+; SI-NEXT:    v_and_b32_e32 v4, 0xff, v49
 ; SI-NEXT:    s_or_b32 s4, s4, s6
-; SI-NEXT:    s_and_b32 s6, s59, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; SI-NEXT:    v_lshlrev_b32_e32 v27, 24, v27
+; SI-NEXT:    s_and_b32 s6, s83, 0xff
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v48
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; SI-NEXT:    s_lshl_b32 s6, s6, 16
-; SI-NEXT:    s_lshl_b32 s7, s58, 24
-; SI-NEXT:    v_or_b32_e32 v15, v27, v15
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; SI-NEXT:    s_lshl_b32 s7, s82, 24
+; SI-NEXT:    v_or_b32_e32 v4, v6, v4
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s6, s7, s6
-; SI-NEXT:    v_or_b32_e32 v13, v13, v15
-; SI-NEXT:    v_add_i32_e32 v15, vcc, 8, v0
+; SI-NEXT:    v_or_b32_e32 v2, v2, v4
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 8, v0
 ; SI-NEXT:    s_or_b32 s4, s4, s6
-; SI-NEXT:    buffer_store_dword v13, v15, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v13, vcc, 12, v0
-; SI-NEXT:    v_mov_b32_e32 v15, s4
-; SI-NEXT:    buffer_store_dword v15, v13, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v11, 0xff, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 8, v16
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 12, v0
+; SI-NEXT:    v_mov_b32_e32 v4, s4
+; SI-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v20
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v38
 ; SI-NEXT:    s_and_b32 s4, s21, 0xff
-; SI-NEXT:    s_lshl_b32 s6, s57, 8
-; SI-NEXT:    v_or_b32_e32 v11, v11, v13
-; SI-NEXT:    v_and_b32_e32 v13, 0xff, v14
+; SI-NEXT:    s_lshl_b32 s6, s81, 8
+; SI-NEXT:    v_or_b32_e32 v2, v2, v4
+; SI-NEXT:    v_and_b32_e32 v4, 0xff, v37
 ; SI-NEXT:    s_or_b32 s4, s4, s6
-; SI-NEXT:    s_and_b32 s6, s56, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 24, v12
+; SI-NEXT:    s_and_b32 s6, s80, 0xff
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v36
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; SI-NEXT:    s_lshl_b32 s6, s6, 16
-; SI-NEXT:    s_lshl_b32 s7, s47, 24
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; SI-NEXT:    v_or_b32_e32 v12, v12, v13
+; SI-NEXT:    s_lshl_b32 s7, s71, 24
+; SI-NEXT:    v_or_b32_e32 v4, v6, v4
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s6, s7, s6
-; SI-NEXT:    v_or_b32_e32 v11, v11, v12
-; SI-NEXT:    v_add_i32_e32 v12, vcc, 16, v0
+; SI-NEXT:    v_or_b32_e32 v2, v2, v4
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 16, v0
 ; SI-NEXT:    s_or_b32 s4, s4, s6
-; SI-NEXT:    buffer_store_dword v11, v12, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 20, v0
-; SI-NEXT:    v_mov_b32_e32 v12, s4
-; SI-NEXT:    buffer_store_dword v12, v11, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 8, v26
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 20, v0
+; SI-NEXT:    v_mov_b32_e32 v4, s4
+; SI-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v13
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v35
 ; SI-NEXT:    s_and_b32 s4, s23, 0xff
-; SI-NEXT:    s_lshl_b32 s6, s46, 8
-; SI-NEXT:    v_or_b32_e32 v9, v9, v11
-; SI-NEXT:    v_and_b32_e32 v11, 0xff, v25
+; SI-NEXT:    s_lshl_b32 s6, s70, 8
+; SI-NEXT:    v_or_b32_e32 v2, v2, v4
+; SI-NEXT:    v_and_b32_e32 v4, 0xff, v34
 ; SI-NEXT:    s_or_b32 s4, s4, s6
-; SI-NEXT:    s_and_b32 s6, s45, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
+; SI-NEXT:    s_and_b32 s6, s69, 0xff
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v30
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; SI-NEXT:    s_lshl_b32 s6, s6, 16
-; SI-NEXT:    s_lshl_b32 s7, s44, 24
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; SI-NEXT:    v_or_b32_e32 v10, v10, v11
+; SI-NEXT:    s_lshl_b32 s7, s68, 24
+; SI-NEXT:    v_or_b32_e32 v4, v6, v4
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s6, s7, s6
-; SI-NEXT:    v_or_b32_e32 v9, v9, v10
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 24, v0
+; SI-NEXT:    v_or_b32_e32 v2, v2, v4
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 24, v0
 ; SI-NEXT:    s_or_b32 s4, s4, s6
-; SI-NEXT:    buffer_store_dword v9, v10, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 28, v0
-; SI-NEXT:    v_mov_b32_e32 v10, s4
-; SI-NEXT:    buffer_store_dword v10, v9, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 8, v24
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 28, v0
+; SI-NEXT:    v_mov_b32_e32 v4, s4
+; SI-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v7
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v26
 ; SI-NEXT:    s_and_b32 s4, s25, 0xff
-; SI-NEXT:    s_lshl_b32 s6, s43, 8
-; SI-NEXT:    v_or_b32_e32 v7, v7, v9
-; SI-NEXT:    v_and_b32_e32 v9, 0xff, v23
+; SI-NEXT:    s_lshl_b32 s6, s67, 8
+; SI-NEXT:    v_or_b32_e32 v2, v2, v4
+; SI-NEXT:    v_and_b32_e32 v4, 0xff, v25
 ; SI-NEXT:    s_or_b32 s4, s4, s6
-; SI-NEXT:    s_and_b32 s6, s42, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 24, v8
+; SI-NEXT:    s_and_b32 s6, s66, 0xff
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v24
 ; SI-NEXT:    s_lshl_b32 s6, s6, 16
-; SI-NEXT:    s_lshl_b32 s7, s41, 24
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; SI-NEXT:    v_or_b32_e32 v8, v8, v9
+; SI-NEXT:    s_lshl_b32 s7, s65, 24
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    v_or_b32_e32 v4, v6, v4
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s6, s7, s6
-; SI-NEXT:    v_or_b32_e32 v7, v7, v8
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 32, v0
+; SI-NEXT:    v_or_b32_e32 v2, v2, v4
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 32, v0
 ; SI-NEXT:    s_or_b32 s4, s4, s6
-; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 36, v0
-; SI-NEXT:    v_mov_b32_e32 v8, s4
-; SI-NEXT:    buffer_store_dword v8, v7, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v22
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 36, v0
+; SI-NEXT:    v_mov_b32_e32 v4, s4
+; SI-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v5
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v23
 ; SI-NEXT:    s_and_b32 s4, s27, 0xff
-; SI-NEXT:    s_lshl_b32 s6, s40, 8
-; SI-NEXT:    v_or_b32_e32 v5, v5, v7
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v21
+; SI-NEXT:    s_lshl_b32 s6, s64, 8
+; SI-NEXT:    v_or_b32_e32 v2, v2, v4
+; SI-NEXT:    v_and_b32_e32 v4, 0xff, v22
 ; SI-NEXT:    s_or_b32 s4, s4, s6
-; SI-NEXT:    s_and_b32 s6, s15, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
+; SI-NEXT:    s_and_b32 s6, s55, 0xff
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 24, v18
 ; SI-NEXT:    s_lshl_b32 s6, s6, 16
-; SI-NEXT:    s_lshl_b32 s7, s14, 24
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; SI-NEXT:    v_or_b32_e32 v6, v6, v7
+; SI-NEXT:    s_lshl_b32 s7, s54, 24
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    v_or_b32_e32 v4, v5, v4
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s6, s7, s6
-; SI-NEXT:    v_or_b32_e32 v5, v5, v6
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 40, v0
+; SI-NEXT:    v_or_b32_e32 v2, v2, v4
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 40, v0
 ; SI-NEXT:    s_or_b32 s4, s4, s6
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 44, v0
-; SI-NEXT:    v_mov_b32_e32 v6, s4
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 44, v0
+; SI-NEXT:    v_mov_b32_e32 v4, s4
+; SI-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v17
 ; SI-NEXT:    s_and_b32 s4, s29, 0xff
-; SI-NEXT:    s_lshl_b32 s6, s13, 8
-; SI-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v19
-; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; SI-NEXT:    s_lshl_b32 s6, s53, 8
+; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_and_b32_e32 v3, 0xff, v16
 ; SI-NEXT:    s_or_b32 s4, s4, s6
-; SI-NEXT:    s_and_b32 s6, s12, 0xff
-; SI-NEXT:    v_or_b32_e32 v3, v3, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 24, v20
+; SI-NEXT:    s_and_b32 s6, s52, 0xff
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v15
 ; SI-NEXT:    s_lshl_b32 s6, s6, 16
-; SI-NEXT:    s_lshl_b32 s7, s11, 24
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v4, v5, v4
+; SI-NEXT:    s_lshl_b32 s7, s51, 24
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    v_or_b32_e32 v3, v4, v3
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s6, s7, s6
-; SI-NEXT:    v_or_b32_e32 v3, v3, v4
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 48, v0
+; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 48, v0
 ; SI-NEXT:    s_or_b32 s4, s4, s6
-; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 52, v0
-; SI-NEXT:    v_mov_b32_e32 v4, s4
-; SI-NEXT:    buffer_store_dword v4, v3, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 52, v0
+; SI-NEXT:    v_mov_b32_e32 v3, s4
+; SI-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v18
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v11
 ; SI-NEXT:    s_and_b32 s4, s5, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s10, 8
-; SI-NEXT:    v_or_b32_e32 v1, v1, v3
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v17
+; SI-NEXT:    s_lshl_b32 s5, s50, 8
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v10
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s9, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; SI-NEXT:    s_and_b32 s5, s49, 0xff
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v9
 ; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s6, s8, 24
+; SI-NEXT:    s_lshl_b32 s6, s48, 24
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_or_b32_e32 v2, v3, v2
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s5, s6, s5
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
@@ -51571,6 +51866,41 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT:    v_readlane_b32 s87, v40, 31
+; SI-NEXT:    v_readlane_b32 s86, v40, 30
+; SI-NEXT:    v_readlane_b32 s85, v40, 29
+; SI-NEXT:    v_readlane_b32 s84, v40, 28
+; SI-NEXT:    v_readlane_b32 s83, v40, 27
+; SI-NEXT:    v_readlane_b32 s82, v40, 26
+; SI-NEXT:    v_readlane_b32 s81, v40, 25
+; SI-NEXT:    v_readlane_b32 s80, v40, 24
+; SI-NEXT:    v_readlane_b32 s71, v40, 23
+; SI-NEXT:    v_readlane_b32 s70, v40, 22
+; SI-NEXT:    v_readlane_b32 s69, v40, 21
+; SI-NEXT:    v_readlane_b32 s68, v40, 20
+; SI-NEXT:    v_readlane_b32 s67, v40, 19
+; SI-NEXT:    v_readlane_b32 s66, v40, 18
+; SI-NEXT:    v_readlane_b32 s65, v40, 17
+; SI-NEXT:    v_readlane_b32 s64, v40, 16
+; SI-NEXT:    v_readlane_b32 s55, v40, 15
+; SI-NEXT:    v_readlane_b32 s54, v40, 14
+; SI-NEXT:    v_readlane_b32 s53, v40, 13
+; SI-NEXT:    v_readlane_b32 s52, v40, 12
+; SI-NEXT:    v_readlane_b32 s51, v40, 11
+; SI-NEXT:    v_readlane_b32 s50, v40, 10
+; SI-NEXT:    v_readlane_b32 s49, v40, 9
+; SI-NEXT:    v_readlane_b32 s48, v40, 8
+; SI-NEXT:    v_readlane_b32 s39, v40, 7
+; SI-NEXT:    v_readlane_b32 s38, v40, 6
+; SI-NEXT:    v_readlane_b32 s37, v40, 5
+; SI-NEXT:    v_readlane_b32 s36, v40, 4
+; SI-NEXT:    v_readlane_b32 s35, v40, 3
+; SI-NEXT:    v_readlane_b32 s34, v40, 2
+; SI-NEXT:    v_readlane_b32 s31, v40, 1
+; SI-NEXT:    v_readlane_b32 s30, v40, 0
+; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -58201,95 +58531,120 @@ define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i
 ; SI-LABEL: bitcast_v32f16_to_v32i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v31, v17
-; SI-NEXT:    v_mov_b32_e32 v30, v16
-; SI-NEXT:    v_mov_b32_e32 v29, v15
-; SI-NEXT:    v_mov_b32_e32 v28, v14
-; SI-NEXT:    v_mov_b32_e32 v15, v1
-; SI-NEXT:    v_mov_b32_e32 v14, v0
+; SI-NEXT:    v_mov_b32_e32 v21, v16
+; SI-NEXT:    v_mov_b32_e32 v25, v15
+; SI-NEXT:    v_mov_b32_e32 v26, v12
+; SI-NEXT:    v_mov_b32_e32 v29, v11
+; SI-NEXT:    v_mov_b32_e32 v22, v8
+; SI-NEXT:    v_mov_b32_e32 v30, v7
+; SI-NEXT:    v_mov_b32_e32 v32, v4
+; SI-NEXT:    v_mov_b32_e32 v33, v3
+; SI-NEXT:    v_mov_b32_e32 v34, v0
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, s16
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, s17
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, s19
+; SI-NEXT:    v_cvt_f16_f32_e32 v15, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v16, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v18, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v19, v5
 ; SI-NEXT:    v_cvt_f16_f32_e32 v20, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v21, v7
-; SI-NEXT:    v_cvt_f16_f32_e32 v22, v8
 ; SI-NEXT:    v_cvt_f16_f32_e32 v23, v9
 ; SI-NEXT:    v_cvt_f16_f32_e32 v24, v10
-; SI-NEXT:    v_cvt_f16_f32_e32 v25, v11
-; SI-NEXT:    v_cvt_f16_f32_e32 v26, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v27, v13
-; SI-NEXT:    v_cvt_f16_f32_e32 v28, v28
-; SI-NEXT:    v_cvt_f16_f32_e32 v29, v29
-; SI-NEXT:    v_cvt_f16_f32_e32 v30, v30
-; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, s18
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, s19
+; SI-NEXT:    v_cvt_f16_f32_e32 v28, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v31, v17
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, s20
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, s21
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, s22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, s23
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, s24
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, s25
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, s26
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, s27
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, s28
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, s29
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v34
+; SI-NEXT:    v_cvt_f16_f32_e32 v48, v33
+; SI-NEXT:    v_cvt_f16_f32_e32 v18, v32
+; SI-NEXT:    v_cvt_f16_f32_e32 v49, v30
+; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
+; SI-NEXT:    v_cvt_f16_f32_e32 v52, v29
+; SI-NEXT:    v_cvt_f16_f32_e32 v26, v26
+; SI-NEXT:    v_cvt_f16_f32_e32 v50, v25
+; SI-NEXT:    v_cvt_f16_f32_e32 v30, v21
+; SI-NEXT:    v_cvt_f16_f32_e32 v37, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, s18
+; SI-NEXT:    v_cvt_f16_f32_e32 v35, s21
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, s22
+; SI-NEXT:    v_cvt_f16_f32_e32 v33, s25
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, s26
+; SI-NEXT:    v_cvt_f16_f32_e32 v38, s29
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_cbranch_scc0 .LBB91_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_cbranch_execnz .LBB91_3
 ; SI-NEXT:  .LBB91_2: ; %cmp.true
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v30, v30
-; SI-NEXT:    v_cvt_f32_f16_e32 v27, v27
-; SI-NEXT:    v_cvt_f32_f16_e32 v26, v26
-; SI-NEXT:    v_cvt_f32_f16_e32 v23, v23
-; SI-NEXT:    v_cvt_f32_f16_e32 v22, v22
-; SI-NEXT:    v_cvt_f32_f16_e32 v19, v19
-; SI-NEXT:    v_add_f32_e32 v31, 0x38000000, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v18, v18
 ; SI-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; SI-NEXT:    v_add_f32_e32 v30, 0x38000000, v30
-; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
-; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v27
 ; SI-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; SI-NEXT:    v_cvt_f16_f32_e32 v30, v30
-; SI-NEXT:    v_cvt_f16_f32_e32 v27, v27
-; SI-NEXT:    v_add_f32_e32 v26, 0x38000000, v26
-; SI-NEXT:    v_add_f32_e32 v23, 0x38000000, v23
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; SI-NEXT:    v_cvt_f32_f16_e32 v19, v19
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT:    v_cvt_f16_f32_e32 v26, v26
-; SI-NEXT:    v_cvt_f16_f32_e32 v23, v23
-; SI-NEXT:    v_add_f32_e32 v22, 0x38000000, v22
-; SI-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; SI-NEXT:    v_cvt_f32_f16_e32 v18, v18
+; SI-NEXT:    v_cvt_f32_f16_e32 v23, v23
+; SI-NEXT:    v_cvt_f32_f16_e32 v27, v27
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
-; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
-; SI-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
+; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; SI-NEXT:    v_cvt_f32_f16_e32 v22, v22
+; SI-NEXT:    v_cvt_f32_f16_e32 v26, v26
 ; SI-NEXT:    v_add_f32_e32 v15, 0x38000000, v15
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
-; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT:    v_cvt_f32_f16_e32 v25, v25
-; SI-NEXT:    v_cvt_f32_f16_e32 v29, v29
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v31
-; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
 ; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
 ; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
+; SI-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
+; SI-NEXT:    v_add_f32_e32 v23, 0x38000000, v23
+; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v27
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
+; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
+; SI-NEXT:    v_cvt_f16_f32_e32 v23, v23
+; SI-NEXT:    v_add_f32_e32 v22, 0x38000000, v22
+; SI-NEXT:    v_cvt_f16_f32_e32 v27, v27
+; SI-NEXT:    v_add_f32_e32 v26, 0x38000000, v26
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
+; SI-NEXT:    v_cvt_f16_f32_e32 v26, v26
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v15
+; SI-NEXT:    v_or_b32_e32 v14, v14, v17
+; SI-NEXT:    v_cvt_f32_f16_e32 v17, v48
+; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v19
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v37
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v11
+; SI-NEXT:    v_or_b32_e32 v18, v18, v21
+; SI-NEXT:    v_cvt_f32_f16_e32 v21, v49
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v23
+; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v27
+; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT:    v_or_b32_e32 v2, v2, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v35
+; SI-NEXT:    v_or_b32_e32 v6, v6, v9
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v33
+; SI-NEXT:    v_or_b32_e32 v10, v10, v13
+; SI-NEXT:    v_cvt_f32_f16_e32 v13, v38
+; SI-NEXT:    v_or_b32_e32 v22, v22, v25
+; SI-NEXT:    v_cvt_f32_f16_e32 v25, v52
+; SI-NEXT:    v_or_b32_e32 v26, v26, v29
+; SI-NEXT:    v_cvt_f32_f16_e32 v29, v50
+; SI-NEXT:    v_cvt_f32_f16_e32 v30, v30
+; SI-NEXT:    v_add_f32_e32 v17, 0x38000000, v17
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
@@ -58298,52 +58653,34 @@ define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f32_f16_e32 v20, v20
 ; SI-NEXT:    v_cvt_f32_f16_e32 v24, v24
 ; SI-NEXT:    v_cvt_f32_f16_e32 v28, v28
-; SI-NEXT:    v_or_b32_e32 v30, v30, v32
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v27
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
-; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
-; SI-NEXT:    v_or_b32_e32 v26, v26, v32
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v23
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
-; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT:    v_or_b32_e32 v22, v22, v32
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v19
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
+; SI-NEXT:    v_add_f32_e32 v21, 0x38000000, v21
+; SI-NEXT:    v_add_f32_e32 v31, 0x38000000, v31
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
 ; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
 ; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
-; SI-NEXT:    v_add_f32_e32 v17, 0x38000000, v17
-; SI-NEXT:    v_add_f32_e32 v21, 0x38000000, v21
+; SI-NEXT:    v_cvt_f16_f32_e32 v21, v21
 ; SI-NEXT:    v_add_f32_e32 v25, 0x38000000, v25
 ; SI-NEXT:    v_add_f32_e32 v29, 0x38000000, v29
-; SI-NEXT:    v_or_b32_e32 v18, v18, v32
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT:    v_add_f32_e32 v30, 0x38000000, v30
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
 ; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
+; SI-NEXT:    v_cvt_f16_f32_e32 v25, v25
+; SI-NEXT:    v_cvt_f16_f32_e32 v30, v30
+; SI-NEXT:    v_cvt_f16_f32_e32 v29, v29
+; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
+; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
+; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
 ; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
 ; SI-NEXT:    v_add_f32_e32 v16, 0x38000000, v16
-; SI-NEXT:    v_cvt_f16_f32_e32 v21, v21
 ; SI-NEXT:    v_add_f32_e32 v20, 0x38000000, v20
-; SI-NEXT:    v_cvt_f16_f32_e32 v25, v25
 ; SI-NEXT:    v_add_f32_e32 v24, 0x38000000, v24
-; SI-NEXT:    v_cvt_f16_f32_e32 v29, v29
 ; SI-NEXT:    v_add_f32_e32 v28, 0x38000000, v28
-; SI-NEXT:    v_or_b32_e32 v14, v14, v32
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
@@ -58352,19 +58689,23 @@ define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i
 ; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
 ; SI-NEXT:    v_cvt_f16_f32_e32 v24, v24
 ; SI-NEXT:    v_cvt_f16_f32_e32 v28, v28
-; SI-NEXT:    v_or_b32_e32 v10, v10, v32
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v7
-; SI-NEXT:    v_or_b32_e32 v6, v6, v32
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v3
-; SI-NEXT:    v_or_b32_e32 v2, v2, v32
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v31
+; SI-NEXT:    v_lshr_b64 v[48:49], v[17:18], 16
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
 ; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; SI-NEXT:    v_or_b32_e32 v30, v30, v32
 ; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
+; SI-NEXT:    v_lshr_b64 v[37:38], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[49:50], v[21:22], 16
+; SI-NEXT:    v_lshr_b64 v[35:36], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[33:34], v[9:10], 16
+; SI-NEXT:    v_lshr_b64 v[38:39], v[13:14], 16
+; SI-NEXT:    v_lshr_b64 v[52:53], v[25:26], 16
+; SI-NEXT:    v_lshr_b64 v[50:51], v[29:30], 16
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SI-NEXT:    v_or_b32_e32 v4, v4, v5
 ; SI-NEXT:    v_or_b32_e32 v8, v8, v9
@@ -58373,15 +58714,15 @@ define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i
 ; SI-NEXT:    v_or_b32_e32 v20, v20, v21
 ; SI-NEXT:    v_or_b32_e32 v24, v24, v25
 ; SI-NEXT:    v_or_b32_e32 v28, v28, v29
-; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v9, v10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v13, v14, v13, 16
-; SI-NEXT:    v_alignbit_b32 v17, v18, v17, 16
-; SI-NEXT:    v_alignbit_b32 v21, v22, v21, 16
-; SI-NEXT:    v_alignbit_b32 v25, v26, v25, 16
-; SI-NEXT:    v_alignbit_b32 v29, v30, v29, 16
 ; SI-NEXT:  .LBB91_3: ; %end
+; SI-NEXT:    v_mov_b32_e32 v1, v37
+; SI-NEXT:    v_mov_b32_e32 v5, v35
+; SI-NEXT:    v_mov_b32_e32 v9, v33
+; SI-NEXT:    v_mov_b32_e32 v13, v38
+; SI-NEXT:    v_mov_b32_e32 v17, v48
+; SI-NEXT:    v_mov_b32_e32 v21, v49
+; SI-NEXT:    v_mov_b32_e32 v25, v52
+; SI-NEXT:    v_mov_b32_e32 v29, v50
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB91_4:
 ; SI-NEXT:    s_branch .LBB91_2
@@ -60786,185 +61127,185 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a
 ; SI-NEXT:    s_waitcnt expcnt(6)
 ; SI-NEXT:    v_mul_f32_e64 v57, 1.0, s16
 ; SI-NEXT:    v_mul_f32_e64 v56, 1.0, s17
-; SI-NEXT:    v_mul_f32_e32 v35, 1.0, v0
-; SI-NEXT:    v_mul_f32_e32 v34, 1.0, v1
+; SI-NEXT:    v_mul_f32_e32 v49, 1.0, v0
+; SI-NEXT:    v_mul_f32_e32 v39, 1.0, v1
 ; SI-NEXT:    v_mul_f32_e32 v47, 1.0, v2
 ; SI-NEXT:    v_mul_f32_e32 v46, 1.0, v3
-; SI-NEXT:    v_mul_f32_e32 v39, 1.0, v4
-; SI-NEXT:    v_mul_f32_e32 v38, 1.0, v5
+; SI-NEXT:    v_mul_f32_e32 v53, 1.0, v4
+; SI-NEXT:    v_mul_f32_e32 v52, 1.0, v5
 ; SI-NEXT:    v_mul_f32_e32 v45, 1.0, v6
 ; SI-NEXT:    v_mul_f32_e32 v44, 1.0, v7
-; SI-NEXT:    v_mul_f32_e32 v51, 1.0, v8
-; SI-NEXT:    v_mul_f32_e32 v50, 1.0, v9
+; SI-NEXT:    v_mul_f32_e32 v55, 1.0, v8
+; SI-NEXT:    v_mul_f32_e32 v21, 1.0, v9
 ; SI-NEXT:    v_mul_f32_e32 v43, 1.0, v10
 ; SI-NEXT:    v_mul_f32_e32 v42, 1.0, v11
-; SI-NEXT:    v_mul_f32_e32 v53, 1.0, v12
-; SI-NEXT:    v_mul_f32_e32 v52, 1.0, v13
+; SI-NEXT:    v_mul_f32_e32 v25, 1.0, v12
+; SI-NEXT:    v_mul_f32_e32 v13, 1.0, v13
 ; SI-NEXT:    v_mul_f32_e32 v41, 1.0, v14
 ; SI-NEXT:    v_mul_f32_e32 v40, 1.0, v15
-; SI-NEXT:    v_mul_f32_e32 v55, 1.0, v16
-; SI-NEXT:    v_mul_f32_e32 v54, 1.0, v17
-; SI-NEXT:    v_mul_f32_e64 v33, 1.0, s18
-; SI-NEXT:    v_mul_f32_e64 v32, 1.0, s19
+; SI-NEXT:    v_mul_f32_e32 v29, 1.0, v16
+; SI-NEXT:    v_mul_f32_e32 v17, 1.0, v17
+; SI-NEXT:    v_mul_f32_e64 v32, 1.0, s18
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s19
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mul_f32_e64 v63, 1.0, s20
 ; SI-NEXT:    v_mul_f32_e64 v62, 1.0, s21
-; SI-NEXT:    v_mul_f32_e64 v37, 1.0, s22
-; SI-NEXT:    v_mul_f32_e64 v36, 1.0, s23
+; SI-NEXT:    v_mul_f32_e64 v51, 1.0, s22
+; SI-NEXT:    v_mul_f32_e64 v5, 1.0, s23
 ; SI-NEXT:    v_mul_f32_e64 v61, 1.0, s24
 ; SI-NEXT:    v_mul_f32_e64 v60, 1.0, s25
-; SI-NEXT:    v_mul_f32_e64 v49, 1.0, s26
-; SI-NEXT:    v_mul_f32_e64 v48, 1.0, s27
+; SI-NEXT:    v_mul_f32_e64 v54, 1.0, s26
+; SI-NEXT:    v_mul_f32_e64 v9, 1.0, s27
 ; SI-NEXT:    v_mul_f32_e64 v59, 1.0, s28
 ; SI-NEXT:    v_mul_f32_e64 v58, 1.0, s29
 ; SI-NEXT:    s_cbranch_scc0 .LBB95_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v57
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v56
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v33
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v32
+; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v56
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v32
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v63
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v62
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v37
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v36
+; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v62
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v51
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
 ; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v61
-; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v60
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v49
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v48
+; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v60
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v54
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v9
 ; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v59
-; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v58
-; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v35
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v34
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v39
 ; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v47
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v46
-; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v39
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v38
+; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v52
 ; SI-NEXT:    v_lshrrev_b32_e32 v20, 16, v45
-; SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v44
-; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v51
-; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v50
+; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v21
 ; SI-NEXT:    v_lshrrev_b32_e32 v24, 16, v43
-; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v42
-; SI-NEXT:    v_lshrrev_b32_e32 v26, 16, v53
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v52
+; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v13
 ; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v41
-; SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v40
-; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v55
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v54
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v17
+; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v58
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v49
+; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v46
+; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v53
+; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v44
+; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v55
+; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v42
+; SI-NEXT:    v_lshrrev_b32_e32 v26, 16, v25
+; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v40
+; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v29
 ; SI-NEXT:    s_cbranch_execnz .LBB95_3
 ; SI-NEXT:  .LBB95_2: ; %cmp.true
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v56
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v56
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v57
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_add_f32_e32 v28, 0x40c00000, v2
 ; SI-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v28
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v62
 ; SI-NEXT:    v_alignbit_b32 v0, v2, v0, 16
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v63
 ; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
 ; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v60
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v60
 ; SI-NEXT:    v_alignbit_b32 v4, v4, v2, 16
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v61
-; SI-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
-; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
-; SI-NEXT:    v_alignbit_b32 v8, v6, v2, 16
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v58
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v59
 ; SI-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
 ; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; SI-NEXT:    v_alignbit_b32 v12, v7, v2, 16
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v46
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v47
+; SI-NEXT:    v_alignbit_b32 v8, v7, v2, 16
+; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v58
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v59
 ; SI-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
-; SI-NEXT:    v_alignbit_b32 v16, v9, v2, 16
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v44
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v7
+; SI-NEXT:    v_alignbit_b32 v12, v10, v2, 16
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v46
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v47
+; SI-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
+; SI-NEXT:    v_alignbit_b32 v16, v11, v2, 16
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v44
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v45
-; SI-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
+; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v9
-; SI-NEXT:    v_alignbit_b32 v20, v10, v2, 16
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v42
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v11
+; SI-NEXT:    v_alignbit_b32 v20, v14, v2, 16
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v42
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v43
-; SI-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
-; SI-NEXT:    v_alignbit_b32 v24, v11, v2, 16
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v40
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v14
+; SI-NEXT:    v_alignbit_b32 v24, v15, v2, 16
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v41
-; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT:    v_add_f32_e32 v41, 0x40c00000, v2
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v40
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v11
-; SI-NEXT:    v_alignbit_b32 v28, v13, v2, 16
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v54
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v2
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v29
+; SI-NEXT:    v_and_b32_e32 v29, 0xffff0000, v2
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v25
+; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v13
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v21
+; SI-NEXT:    v_alignbit_b32 v26, v27, v2, 16
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v55
 ; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v13
-; SI-NEXT:    v_and_b32_e32 v29, 0xffff0000, v11
+; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v13
+; SI-NEXT:    v_and_b32_e32 v21, 0xffff0000, v11
 ; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v52
-; SI-NEXT:    v_alignbit_b32 v30, v31, v2, 16
+; SI-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
+; SI-NEXT:    v_alignbit_b32 v22, v23, v2, 16
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v53
 ; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v17
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v11
-; SI-NEXT:    v_and_b32_e32 v25, 0xffff0000, v10
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v50
-; SI-NEXT:    v_alignbit_b32 v26, v27, v2, 16
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v51
-; SI-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v10
-; SI-NEXT:    v_and_b32_e32 v21, 0xffff0000, v9
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v38
-; SI-NEXT:    v_alignbit_b32 v22, v23, v2, 16
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v39
-; SI-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v9
-; SI-NEXT:    v_and_b32_e32 v17, 0xffff0000, v7
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v34
+; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v11
+; SI-NEXT:    v_and_b32_e32 v17, 0xffff0000, v10
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v39
+; SI-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
 ; SI-NEXT:    v_alignbit_b32 v18, v19, v2, 16
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v35
-; SI-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v49
+; SI-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; SI-NEXT:    v_alignbit_b32 v30, v31, v15, 16
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v6
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v48
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v10
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v7
+; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v9
+; SI-NEXT:    v_and_b32_e32 v25, 0xffff0000, v14
 ; SI-NEXT:    v_alignbit_b32 v14, v15, v2, 16
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v49
-; SI-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v54
+; SI-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v6
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v5
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v36
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v7
+; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; SI-NEXT:    v_alignbit_b32 v10, v11, v2, 16
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v37
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v51
 ; SI-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
 ; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v32
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v6
 ; SI-NEXT:    v_alignbit_b32 v6, v7, v2, 16
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v33
-; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v32
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; SI-NEXT:    v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v5, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v9, v10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v13, v14, v13, 16
-; SI-NEXT:    v_alignbit_b32 v17, v18, v17, 16
-; SI-NEXT:    v_alignbit_b32 v21, v22, v21, 16
-; SI-NEXT:    v_alignbit_b32 v25, v26, v25, 16
-; SI-NEXT:    v_alignbit_b32 v29, v30, v29, 16
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v28
+; SI-NEXT:    v_lshr_b64 v[33:34], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[34:35], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[35:36], v[9:10], 16
+; SI-NEXT:    v_lshr_b64 v[36:37], v[13:14], 16
+; SI-NEXT:    v_lshr_b64 v[37:38], v[17:18], 16
+; SI-NEXT:    v_lshr_b64 v[38:39], v[21:22], 16
+; SI-NEXT:    v_lshr_b64 v[50:51], v[25:26], 16
+; SI-NEXT:    v_lshr_b64 v[48:49], v[29:30], 16
+; SI-NEXT:    v_alignbit_b32 v28, v40, v41, 16
 ; SI-NEXT:  .LBB95_3: ; %end
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -60982,41 +61323,49 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a
 ; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v1, v33
+; SI-NEXT:    v_mov_b32_e32 v5, v34
+; SI-NEXT:    v_mov_b32_e32 v9, v35
+; SI-NEXT:    v_mov_b32_e32 v13, v36
+; SI-NEXT:    v_mov_b32_e32 v17, v37
+; SI-NEXT:    v_mov_b32_e32 v21, v38
+; SI-NEXT:    v_mov_b32_e32 v25, v50
+; SI-NEXT:    v_mov_b32_e32 v29, v48
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB95_4:
 ; SI-NEXT:    ; implicit-def: $vgpr0
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr33
 ; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $vgpr34
 ; SI-NEXT:    ; implicit-def: $vgpr6
 ; SI-NEXT:    ; implicit-def: $vgpr7
 ; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $vgpr35
 ; SI-NEXT:    ; implicit-def: $vgpr10
 ; SI-NEXT:    ; implicit-def: $vgpr11
 ; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $vgpr14
 ; SI-NEXT:    ; implicit-def: $vgpr15
 ; SI-NEXT:    ; implicit-def: $vgpr16
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr18
 ; SI-NEXT:    ; implicit-def: $vgpr19
 ; SI-NEXT:    ; implicit-def: $vgpr20
-; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr22
 ; SI-NEXT:    ; implicit-def: $vgpr23
 ; SI-NEXT:    ; implicit-def: $vgpr24
-; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr26
 ; SI-NEXT:    ; implicit-def: $vgpr27
 ; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr29
-; SI-NEXT:    ; implicit-def: $vgpr30
 ; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr36
+; SI-NEXT:    ; implicit-def: $vgpr14
+; SI-NEXT:    ; implicit-def: $vgpr37
+; SI-NEXT:    ; implicit-def: $vgpr18
+; SI-NEXT:    ; implicit-def: $vgpr38
+; SI-NEXT:    ; implicit-def: $vgpr22
+; SI-NEXT:    ; implicit-def: $vgpr50
+; SI-NEXT:    ; implicit-def: $vgpr26
+; SI-NEXT:    ; implicit-def: $vgpr48
+; SI-NEXT:    ; implicit-def: $vgpr30
 ; SI-NEXT:    s_branch .LBB95_2
 ;
 ; VI-LABEL: bitcast_v32bf16_to_v32i16_scalar:
@@ -64240,534 +64589,686 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; SI-NEXT:    buffer_store_dword v37, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_writelane_b32 v37, s30, 0
-; SI-NEXT:    v_writelane_b32 v37, s31, 1
-; SI-NEXT:    v_writelane_b32 v37, s34, 2
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_writelane_b32 v20, s30, 0
+; SI-NEXT:    v_writelane_b32 v20, s31, 1
+; SI-NEXT:    v_writelane_b32 v20, s34, 2
+; SI-NEXT:    v_writelane_b32 v20, s35, 3
+; SI-NEXT:    v_writelane_b32 v20, s36, 4
+; SI-NEXT:    v_writelane_b32 v20, s37, 5
+; SI-NEXT:    v_writelane_b32 v20, s38, 6
+; SI-NEXT:    v_writelane_b32 v20, s39, 7
+; SI-NEXT:    v_writelane_b32 v20, s48, 8
+; SI-NEXT:    v_writelane_b32 v20, s49, 9
+; SI-NEXT:    v_writelane_b32 v20, s50, 10
+; SI-NEXT:    v_writelane_b32 v20, s51, 11
+; SI-NEXT:    v_writelane_b32 v20, s52, 12
+; SI-NEXT:    v_writelane_b32 v20, s53, 13
+; SI-NEXT:    v_writelane_b32 v20, s54, 14
+; SI-NEXT:    v_writelane_b32 v20, s55, 15
+; SI-NEXT:    v_writelane_b32 v20, s64, 16
+; SI-NEXT:    v_writelane_b32 v20, s65, 17
+; SI-NEXT:    v_writelane_b32 v20, s66, 18
+; SI-NEXT:    v_writelane_b32 v20, s67, 19
+; SI-NEXT:    v_writelane_b32 v20, s68, 20
+; SI-NEXT:    v_writelane_b32 v20, s69, 21
+; SI-NEXT:    v_writelane_b32 v20, s70, 22
+; SI-NEXT:    v_writelane_b32 v20, s71, 23
+; SI-NEXT:    v_writelane_b32 v20, s80, 24
+; SI-NEXT:    v_writelane_b32 v20, s81, 25
+; SI-NEXT:    v_writelane_b32 v20, s82, 26
+; SI-NEXT:    v_writelane_b32 v20, s83, 27
+; SI-NEXT:    v_writelane_b32 v20, s84, 28
+; SI-NEXT:    v_writelane_b32 v20, s85, 29
+; SI-NEXT:    v_writelane_b32 v20, s86, 30
+; SI-NEXT:    v_writelane_b32 v20, s87, 31
+; SI-NEXT:    v_writelane_b32 v20, s96, 32
+; SI-NEXT:    v_writelane_b32 v20, s97, 33
+; SI-NEXT:    v_writelane_b32 v20, s98, 34
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v19
-; SI-NEXT:    v_writelane_b32 v37, s35, 3
-; SI-NEXT:    v_readfirstlane_b32 s34, v18
-; SI-NEXT:    v_readfirstlane_b32 s35, v17
-; SI-NEXT:    v_readfirstlane_b32 s30, v14
-; SI-NEXT:    v_readfirstlane_b32 s31, v13
-; SI-NEXT:    v_readfirstlane_b32 s94, v10
-; SI-NEXT:    v_readfirstlane_b32 s95, v9
-; SI-NEXT:    v_readfirstlane_b32 s92, v6
-; SI-NEXT:    v_readfirstlane_b32 s93, v5
-; SI-NEXT:    v_readfirstlane_b32 s90, v2
-; SI-NEXT:    v_readfirstlane_b32 s91, v1
+; SI-NEXT:    v_writelane_b32 v20, s99, 35
+; SI-NEXT:    s_mov_b32 s93, s18
+; SI-NEXT:    s_mov_b32 s31, s17
+; SI-NEXT:    v_readfirstlane_b32 s59, v18
+; SI-NEXT:    v_readfirstlane_b32 s18, v17
+; SI-NEXT:    v_readfirstlane_b32 s63, v16
+; SI-NEXT:    v_readfirstlane_b32 s17, v15
+; SI-NEXT:    v_readfirstlane_b32 s72, v14
+; SI-NEXT:    v_readfirstlane_b32 s76, v13
+; SI-NEXT:    v_readfirstlane_b32 s57, v12
+; SI-NEXT:    v_readfirstlane_b32 s61, v11
+; SI-NEXT:    v_readfirstlane_b32 s44, v10
+; SI-NEXT:    v_readfirstlane_b32 s58, v9
+; SI-NEXT:    v_readfirstlane_b32 s62, v8
+; SI-NEXT:    v_readfirstlane_b32 s45, v7
+; SI-NEXT:    v_readfirstlane_b32 s96, v6
+; SI-NEXT:    v_readfirstlane_b32 s97, v5
+; SI-NEXT:    v_readfirstlane_b32 s99, v4
+; SI-NEXT:    v_readfirstlane_b32 s46, v3
+; SI-NEXT:    v_readfirstlane_b32 s83, v2
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v36, 16, v16
+; SI-NEXT:    v_readfirstlane_b32 s85, v1
+; SI-NEXT:    ; implicit-def: $vgpr21 : SGPR spill to VGPR lane
 ; SI-NEXT:    s_cbranch_scc0 .LBB97_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_and_b32 s4, s16, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s17, 16
+; SI-NEXT:    s_lshl_b32 s5, s31, 16
 ; SI-NEXT:    s_or_b32 s40, s4, s5
-; SI-NEXT:    s_and_b32 s4, s18, 0xffff
+; SI-NEXT:    s_and_b32 s4, s93, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s19, 16
 ; SI-NEXT:    s_or_b32 s41, s4, s5
+; SI-NEXT:    s_lshr_b64 s[4:5], s[40:41], 24
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_writelane_b32 v21, s4, 4
+; SI-NEXT:    v_writelane_b32 v21, s5, 5
+; SI-NEXT:    s_lshr_b64 s[4:5], s[40:41], 16
+; SI-NEXT:    v_writelane_b32 v21, s4, 2
+; SI-NEXT:    v_writelane_b32 v21, s5, 3
+; SI-NEXT:    s_lshr_b64 s[4:5], s[40:41], 8
+; SI-NEXT:    v_writelane_b32 v21, s4, 0
+; SI-NEXT:    v_writelane_b32 v21, s5, 1
 ; SI-NEXT:    s_and_b32 s4, s20, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s21, 16
 ; SI-NEXT:    s_or_b32 s14, s4, s5
 ; SI-NEXT:    s_and_b32 s4, s22, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s23, 16
 ; SI-NEXT:    s_or_b32 s15, s4, s5
+; SI-NEXT:    s_lshr_b64 s[4:5], s[14:15], 24
+; SI-NEXT:    v_writelane_b32 v21, s4, 10
+; SI-NEXT:    v_writelane_b32 v21, s5, 11
+; SI-NEXT:    s_lshr_b64 s[4:5], s[14:15], 16
+; SI-NEXT:    v_writelane_b32 v21, s4, 8
+; SI-NEXT:    v_writelane_b32 v21, s5, 9
+; SI-NEXT:    s_lshr_b64 s[4:5], s[14:15], 8
+; SI-NEXT:    v_writelane_b32 v21, s4, 6
+; SI-NEXT:    v_writelane_b32 v21, s5, 7
 ; SI-NEXT:    s_and_b32 s4, s24, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s25, 16
-; SI-NEXT:    v_mov_b32_e32 v1, s40
-; SI-NEXT:    s_or_b32 s12, s4, s5
+; SI-NEXT:    s_or_b32 s10, s4, s5
 ; SI-NEXT:    s_and_b32 s4, s26, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s27, 16
-; SI-NEXT:    v_alignbit_b32 v18, s41, v1, 24
-; SI-NEXT:    v_alignbit_b32 v25, s41, v1, 16
-; SI-NEXT:    v_alignbit_b32 v30, s41, v1, 8
-; SI-NEXT:    v_mov_b32_e32 v1, s14
-; SI-NEXT:    s_or_b32 s13, s4, s5
+; SI-NEXT:    s_or_b32 s11, s4, s5
+; SI-NEXT:    s_lshr_b64 s[4:5], s[10:11], 24
+; SI-NEXT:    v_writelane_b32 v21, s4, 16
+; SI-NEXT:    v_writelane_b32 v21, s5, 17
+; SI-NEXT:    s_lshr_b64 s[4:5], s[10:11], 16
+; SI-NEXT:    v_writelane_b32 v21, s4, 14
+; SI-NEXT:    v_writelane_b32 v21, s5, 15
+; SI-NEXT:    s_lshr_b64 s[4:5], s[10:11], 8
+; SI-NEXT:    v_writelane_b32 v21, s4, 12
+; SI-NEXT:    v_writelane_b32 v21, s5, 13
 ; SI-NEXT:    s_and_b32 s4, s28, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s29, 16
-; SI-NEXT:    v_alignbit_b32 v19, s15, v1, 24
-; SI-NEXT:    v_alignbit_b32 v26, s15, v1, 16
-; SI-NEXT:    v_alignbit_b32 v31, s15, v1, 8
-; SI-NEXT:    v_mov_b32_e32 v1, s12
-; SI-NEXT:    s_or_b32 s10, s4, s5
-; SI-NEXT:    s_and_b32 s4, s91, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s90, 16
-; SI-NEXT:    v_alignbit_b32 v17, s13, v1, 24
-; SI-NEXT:    v_alignbit_b32 v23, s13, v1, 16
-; SI-NEXT:    v_alignbit_b32 v29, s13, v1, 8
-; SI-NEXT:    s_or_b32 s11, s4, s5
-; SI-NEXT:    v_mov_b32_e32 v1, s10
-; SI-NEXT:    v_alignbit_b32 v16, s11, v1, 24
-; SI-NEXT:    v_alignbit_b32 v20, s11, v1, 16
-; SI-NEXT:    v_alignbit_b32 v27, s11, v1, 8
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v3
-; SI-NEXT:    s_and_b32 s4, s93, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s92, 16
-; SI-NEXT:    v_or_b32_e32 v5, v1, v33
-; SI-NEXT:    s_or_b32 s9, s4, s5
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v7
-; SI-NEXT:    s_and_b32 s4, s95, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s94, 16
-; SI-NEXT:    v_or_b32_e32 v4, v1, v34
+; SI-NEXT:    s_or_b32 s42, s4, s5
+; SI-NEXT:    s_and_b32 s4, s85, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s83, 16
+; SI-NEXT:    s_or_b32 s43, s4, s5
+; SI-NEXT:    s_and_b32 s4, s46, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s99, 16
+; SI-NEXT:    s_or_b32 s12, s4, s5
+; SI-NEXT:    s_and_b32 s4, s97, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s96, 16
+; SI-NEXT:    s_or_b32 s13, s4, s5
+; SI-NEXT:    s_and_b32 s4, s45, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s62, 16
 ; SI-NEXT:    s_or_b32 s8, s4, s5
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v11
-; SI-NEXT:    s_and_b32 s4, s31, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s30, 16
-; SI-NEXT:    v_or_b32_e32 v2, v1, v35
-; SI-NEXT:    s_or_b32 s7, s4, s5
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v15
-; SI-NEXT:    s_and_b32 s4, s35, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s34, 16
-; SI-NEXT:    v_or_b32_e32 v1, v1, v36
+; SI-NEXT:    s_and_b32 s4, s58, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s44, 16
+; SI-NEXT:    s_or_b32 s9, s4, s5
+; SI-NEXT:    s_and_b32 s4, s61, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s57, 16
 ; SI-NEXT:    s_or_b32 s6, s4, s5
-; SI-NEXT:    v_alignbit_b32 v9, s9, v5, 24
-; SI-NEXT:    v_alignbit_b32 v12, s9, v5, 16
-; SI-NEXT:    v_alignbit_b32 v21, s9, v5, 8
-; SI-NEXT:    v_alignbit_b32 v6, s8, v4, 24
-; SI-NEXT:    v_alignbit_b32 v8, s8, v4, 16
-; SI-NEXT:    v_alignbit_b32 v13, s8, v4, 8
-; SI-NEXT:    v_alignbit_b32 v24, s7, v2, 24
-; SI-NEXT:    v_alignbit_b32 v28, s7, v2, 16
-; SI-NEXT:    v_alignbit_b32 v32, s7, v2, 8
-; SI-NEXT:    v_alignbit_b32 v10, s6, v1, 24
-; SI-NEXT:    v_alignbit_b32 v14, s6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v22, s6, v1, 8
-; SI-NEXT:    s_lshr_b32 s78, s41, 8
-; SI-NEXT:    s_lshr_b32 s75, s15, 8
-; SI-NEXT:    s_lshr_b32 s72, s13, 8
-; SI-NEXT:    s_lshr_b32 s61, s11, 8
-; SI-NEXT:    s_lshr_b32 s58, s9, 8
-; SI-NEXT:    s_lshr_b32 s47, s8, 8
-; SI-NEXT:    s_lshr_b32 s45, s7, 8
-; SI-NEXT:    s_lshr_b32 s42, s6, 8
-; SI-NEXT:    s_and_b32 s88, s19, 0xffff
-; SI-NEXT:    s_and_b32 s77, s23, 0xffff
-; SI-NEXT:    s_and_b32 s74, s27, 0xffff
-; SI-NEXT:    s_and_b32 s63, s90, 0xffff
-; SI-NEXT:    s_and_b32 s60, s92, 0xffff
-; SI-NEXT:    s_and_b32 s57, s94, 0xffff
-; SI-NEXT:    s_and_b32 s46, s30, 0xffff
-; SI-NEXT:    s_and_b32 s43, s34, 0xffff
-; SI-NEXT:    s_bfe_u32 s89, s19, 0x80008
-; SI-NEXT:    s_bfe_u32 s79, s23, 0x80008
-; SI-NEXT:    s_bfe_u32 s76, s27, 0x80008
-; SI-NEXT:    s_bfe_u32 s73, s90, 0x80008
-; SI-NEXT:    s_bfe_u32 s62, s92, 0x80008
-; SI-NEXT:    s_bfe_u32 s59, s94, 0x80008
-; SI-NEXT:    s_bfe_u32 s56, s30, 0x80008
-; SI-NEXT:    s_bfe_u32 s44, s34, 0x80008
+; SI-NEXT:    s_and_b32 s4, s76, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s72, 16
+; SI-NEXT:    s_or_b32 s7, s4, s5
+; SI-NEXT:    s_and_b32 s4, s17, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s63, 16
+; SI-NEXT:    s_and_b32 s78, s72, 0xffff
+; SI-NEXT:    s_lshr_b64 s[34:35], s[8:9], 24
+; SI-NEXT:    s_or_b32 s4, s4, s5
+; SI-NEXT:    s_and_b32 s5, s18, 0xffff
+; SI-NEXT:    s_lshl_b32 s47, s59, 16
+; SI-NEXT:    s_mov_b32 s35, s78
+; SI-NEXT:    s_mov_b32 s78, s93
+; SI-NEXT:    s_lshr_b64 s[92:93], s[6:7], 24
+; SI-NEXT:    s_or_b32 s5, s5, s47
+; SI-NEXT:    s_lshr_b32 s79, s7, 8
+; SI-NEXT:    s_mov_b32 s93, s78
+; SI-NEXT:    s_lshr_b64 s[94:95], s[6:7], 16
+; SI-NEXT:    s_mov_b32 s78, s31
+; SI-NEXT:    s_lshr_b64 s[30:31], s[6:7], 8
+; SI-NEXT:    s_lshr_b32 s88, s5, 8
+; SI-NEXT:    s_bfe_u32 s89, s72, 0x80008
+; SI-NEXT:    s_lshr_b64 s[36:37], s[8:9], 16
+; SI-NEXT:    s_mov_b32 s95, s79
+; SI-NEXT:    s_mov_b32 s31, s78
+; SI-NEXT:    s_lshr_b64 s[78:79], s[4:5], 24
+; SI-NEXT:    s_and_b32 s90, s59, 0xffff
+; SI-NEXT:    s_mov_b32 s37, s89
+; SI-NEXT:    s_mov_b32 s79, s88
+; SI-NEXT:    s_lshr_b64 s[88:89], s[4:5], 16
+; SI-NEXT:    s_bfe_u32 vcc_lo, s59, 0x80008
+; SI-NEXT:    s_mov_b32 s89, s90
+; SI-NEXT:    s_lshr_b64 s[90:91], s[4:5], 8
+; SI-NEXT:    s_lshr_b32 s60, s41, 8
+; SI-NEXT:    s_lshr_b32 s87, s15, 8
+; SI-NEXT:    s_lshr_b32 s82, s11, 8
+; SI-NEXT:    s_lshr_b32 s71, s43, 8
+; SI-NEXT:    s_lshr_b32 s68, s13, 8
+; SI-NEXT:    s_lshr_b32 s73, s9, 8
+; SI-NEXT:    s_and_b32 s74, s19, 0xffff
+; SI-NEXT:    s_and_b32 s98, s23, 0xffff
+; SI-NEXT:    s_and_b32 s84, s27, 0xffff
+; SI-NEXT:    s_and_b32 s80, s83, 0xffff
+; SI-NEXT:    s_and_b32 s69, s96, 0xffff
+; SI-NEXT:    s_and_b32 s75, s44, 0xffff
+; SI-NEXT:    s_bfe_u32 s47, s19, 0x80008
+; SI-NEXT:    s_bfe_u32 s56, s23, 0x80008
+; SI-NEXT:    s_bfe_u32 s86, s27, 0x80008
+; SI-NEXT:    s_bfe_u32 s81, s83, 0x80008
+; SI-NEXT:    s_bfe_u32 s70, s96, 0x80008
+; SI-NEXT:    s_bfe_u32 s77, s44, 0x80008
+; SI-NEXT:    s_lshr_b64 s[54:55], s[42:43], 24
+; SI-NEXT:    s_lshr_b64 s[64:65], s[42:43], 16
+; SI-NEXT:    s_lshr_b64 s[66:67], s[42:43], 8
+; SI-NEXT:    s_lshr_b64 s[48:49], s[12:13], 24
+; SI-NEXT:    s_lshr_b64 s[50:51], s[12:13], 16
+; SI-NEXT:    s_lshr_b64 s[52:53], s[12:13], 8
+; SI-NEXT:    s_lshr_b64 s[38:39], s[8:9], 8
+; SI-NEXT:    s_mov_b32 s91, vcc_lo
 ; SI-NEXT:    s_cbranch_execnz .LBB97_3
 ; SI-NEXT:  .LBB97_2: ; %cmp.true
-; SI-NEXT:    s_add_i32 s35, s35, 3
-; SI-NEXT:    s_and_b32 s4, s35, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s34, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s31, s31, 3
-; SI-NEXT:    s_add_i32 s6, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s31, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s30, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s95, s95, 3
-; SI-NEXT:    s_add_i32 s7, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s95, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s94, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s93, s93, 3
-; SI-NEXT:    s_add_i32 s8, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s93, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s92, 16
+; SI-NEXT:    s_add_i32 s17, s17, 3
+; SI-NEXT:    s_and_b32 s4, s17, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s63, 16
+; SI-NEXT:    s_add_i32 s18, s18, 3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    s_and_b32 s5, s18, 0xffff
+; SI-NEXT:    s_lshl_b32 s6, s59, 16
+; SI-NEXT:    s_add_i32 s61, s61, 3
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    s_and_b32 s6, s61, 0xffff
+; SI-NEXT:    s_lshl_b32 s7, s57, 16
+; SI-NEXT:    s_add_i32 s76, s76, 3
+; SI-NEXT:    s_or_b32 s6, s7, s6
+; SI-NEXT:    s_and_b32 s7, s76, 0xffff
+; SI-NEXT:    s_lshl_b32 s8, s72, 16
+; SI-NEXT:    s_add_i32 s45, s45, 3
+; SI-NEXT:    s_or_b32 s7, s8, s7
+; SI-NEXT:    s_and_b32 s8, s45, 0xffff
+; SI-NEXT:    s_lshl_b32 s9, s62, 16
+; SI-NEXT:    s_add_i32 s58, s58, 3
+; SI-NEXT:    s_or_b32 s8, s9, s8
+; SI-NEXT:    s_and_b32 s9, s58, 0xffff
+; SI-NEXT:    s_lshl_b32 s10, s44, 16
+; SI-NEXT:    s_add_i32 s46, s46, 3
+; SI-NEXT:    s_or_b32 s9, s10, s9
+; SI-NEXT:    s_and_b32 s10, s46, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s99, 16
+; SI-NEXT:    s_or_b32 s10, s11, s10
+; SI-NEXT:    s_add_i32 s97, s97, 3
+; SI-NEXT:    s_add_i32 s12, s10, 0x30000
+; SI-NEXT:    s_and_b32 s10, s97, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s96, 16
+; SI-NEXT:    s_or_b32 s10, s11, s10
 ; SI-NEXT:    s_add_i32 s28, s28, 3
-; SI-NEXT:    s_add_i32 s9, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s28, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s29, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s91, s91, 3
-; SI-NEXT:    s_add_i32 s10, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s91, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s90, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    s_add_i32 s13, s10, 0x30000
+; SI-NEXT:    s_and_b32 s10, s28, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s29, 16
+; SI-NEXT:    s_or_b32 s10, s11, s10
+; SI-NEXT:    s_add_i32 s85, s85, 3
+; SI-NEXT:    s_add_i32 s42, s10, 0x30000
+; SI-NEXT:    s_and_b32 s10, s85, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s83, 16
+; SI-NEXT:    s_or_b32 s10, s11, s10
 ; SI-NEXT:    s_add_i32 s24, s24, 3
-; SI-NEXT:    s_add_i32 s11, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s24, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s25, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    s_add_i32 s43, s10, 0x30000
+; SI-NEXT:    s_and_b32 s10, s24, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s25, 16
 ; SI-NEXT:    s_add_i32 s26, s26, 3
-; SI-NEXT:    s_add_i32 s12, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s26, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s27, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    s_or_b32 s10, s11, s10
+; SI-NEXT:    s_and_b32 s11, s26, 0xffff
+; SI-NEXT:    s_lshl_b32 s14, s27, 16
 ; SI-NEXT:    s_add_i32 s20, s20, 3
-; SI-NEXT:    s_add_i32 s13, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s20, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s21, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    s_or_b32 s11, s14, s11
+; SI-NEXT:    s_and_b32 s14, s20, 0xffff
+; SI-NEXT:    s_lshl_b32 s15, s21, 16
 ; SI-NEXT:    s_add_i32 s22, s22, 3
-; SI-NEXT:    s_add_i32 s14, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s22, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s23, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    s_or_b32 s14, s15, s14
+; SI-NEXT:    s_and_b32 s15, s22, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s23, 16
 ; SI-NEXT:    s_add_i32 s16, s16, 3
-; SI-NEXT:    s_add_i32 s15, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s16, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s17, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s18, s18, 3
-; SI-NEXT:    s_add_i32 s40, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s18, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s19, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v15
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v11
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 3, v7
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 3, v3
-; SI-NEXT:    s_add_i32 s41, s4, 0x30000
-; SI-NEXT:    v_mov_b32_e32 v6, s40
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_alignbit_b32 v18, s41, v6, 24
-; SI-NEXT:    v_alignbit_b32 v25, s41, v6, 16
-; SI-NEXT:    v_alignbit_b32 v30, s41, v6, 8
-; SI-NEXT:    v_mov_b32_e32 v6, s14
-; SI-NEXT:    v_or_b32_e32 v1, v36, v1
-; SI-NEXT:    v_or_b32_e32 v2, v35, v2
-; SI-NEXT:    v_or_b32_e32 v4, v34, v4
-; SI-NEXT:    v_or_b32_e32 v3, v33, v3
-; SI-NEXT:    v_alignbit_b32 v19, s15, v6, 24
-; SI-NEXT:    v_alignbit_b32 v26, s15, v6, 16
-; SI-NEXT:    v_alignbit_b32 v31, s15, v6, 8
-; SI-NEXT:    v_mov_b32_e32 v6, s12
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x30000, v1
-; SI-NEXT:    v_mov_b32_e32 v15, s6
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x30000, v2
-; SI-NEXT:    v_mov_b32_e32 v10, s7
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x30000, v4
-; SI-NEXT:    v_mov_b32_e32 v7, s8
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x30000, v3
-; SI-NEXT:    v_mov_b32_e32 v3, s9
-; SI-NEXT:    v_alignbit_b32 v17, s13, v6, 24
-; SI-NEXT:    v_alignbit_b32 v23, s13, v6, 16
-; SI-NEXT:    v_alignbit_b32 v29, s13, v6, 8
-; SI-NEXT:    v_mov_b32_e32 v6, s10
-; SI-NEXT:    v_alignbit_b32 v16, s11, v6, 24
-; SI-NEXT:    v_alignbit_b32 v20, s11, v6, 16
-; SI-NEXT:    v_alignbit_b32 v27, s11, v6, 8
-; SI-NEXT:    v_alignbit_b32 v9, v3, v5, 24
-; SI-NEXT:    v_alignbit_b32 v12, v3, v5, 16
-; SI-NEXT:    v_alignbit_b32 v21, v3, v5, 8
-; SI-NEXT:    v_alignbit_b32 v6, v7, v4, 24
-; SI-NEXT:    v_alignbit_b32 v8, v7, v4, 16
-; SI-NEXT:    v_alignbit_b32 v13, v7, v4, 8
-; SI-NEXT:    v_alignbit_b32 v24, v10, v2, 24
-; SI-NEXT:    v_alignbit_b32 v28, v10, v2, 16
-; SI-NEXT:    v_alignbit_b32 v32, v10, v2, 8
-; SI-NEXT:    v_alignbit_b32 v10, v15, v1, 24
-; SI-NEXT:    v_alignbit_b32 v14, v15, v1, 16
-; SI-NEXT:    v_alignbit_b32 v22, v15, v1, 8
-; SI-NEXT:    s_lshr_b32 s89, s41, 24
-; SI-NEXT:    s_lshr_b32 s88, s41, 16
-; SI-NEXT:    s_lshr_b32 s78, s41, 8
-; SI-NEXT:    s_lshr_b32 s79, s15, 24
-; SI-NEXT:    s_lshr_b32 s77, s15, 16
-; SI-NEXT:    s_lshr_b32 s75, s15, 8
-; SI-NEXT:    s_lshr_b32 s76, s13, 24
-; SI-NEXT:    s_lshr_b32 s74, s13, 16
-; SI-NEXT:    s_lshr_b32 s72, s13, 8
-; SI-NEXT:    s_lshr_b32 s73, s11, 24
-; SI-NEXT:    s_lshr_b32 s63, s11, 16
-; SI-NEXT:    s_lshr_b32 s61, s11, 8
-; SI-NEXT:    s_lshr_b32 s62, s9, 24
-; SI-NEXT:    s_lshr_b32 s60, s9, 16
-; SI-NEXT:    s_lshr_b32 s58, s9, 8
-; SI-NEXT:    s_lshr_b32 s59, s8, 24
-; SI-NEXT:    s_lshr_b32 s57, s8, 16
-; SI-NEXT:    s_lshr_b32 s47, s8, 8
-; SI-NEXT:    s_lshr_b32 s56, s7, 24
-; SI-NEXT:    s_lshr_b32 s46, s7, 16
-; SI-NEXT:    s_lshr_b32 s45, s7, 8
-; SI-NEXT:    s_lshr_b32 s44, s6, 24
-; SI-NEXT:    s_lshr_b32 s43, s6, 16
-; SI-NEXT:    s_lshr_b32 s42, s6, 8
+; SI-NEXT:    s_or_b32 s15, s17, s15
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s31, 16
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s40, s16, 0x30000
+; SI-NEXT:    s_add_i32 s16, s93, 3
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s19, 16
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    s_add_i32 s41, s16, 0x30000
+; SI-NEXT:    s_lshr_b64 s[16:17], s[40:41], 24
+; SI-NEXT:    v_writelane_b32 v21, s16, 4
+; SI-NEXT:    v_writelane_b32 v21, s17, 5
+; SI-NEXT:    s_lshr_b64 s[16:17], s[40:41], 16
+; SI-NEXT:    v_writelane_b32 v21, s16, 2
+; SI-NEXT:    v_writelane_b32 v21, s17, 3
+; SI-NEXT:    s_lshr_b64 s[16:17], s[40:41], 8
+; SI-NEXT:    s_add_i32 s14, s14, 0x30000
+; SI-NEXT:    s_add_i32 s15, s15, 0x30000
+; SI-NEXT:    v_writelane_b32 v21, s16, 0
+; SI-NEXT:    v_writelane_b32 v21, s17, 1
+; SI-NEXT:    s_lshr_b64 s[16:17], s[14:15], 24
+; SI-NEXT:    v_writelane_b32 v21, s16, 10
+; SI-NEXT:    v_writelane_b32 v21, s17, 11
+; SI-NEXT:    s_lshr_b64 s[16:17], s[14:15], 16
+; SI-NEXT:    v_writelane_b32 v21, s16, 8
+; SI-NEXT:    v_writelane_b32 v21, s17, 9
+; SI-NEXT:    s_lshr_b64 s[16:17], s[14:15], 8
+; SI-NEXT:    s_add_i32 s10, s10, 0x30000
+; SI-NEXT:    s_add_i32 s11, s11, 0x30000
+; SI-NEXT:    v_writelane_b32 v21, s16, 6
+; SI-NEXT:    v_writelane_b32 v21, s17, 7
+; SI-NEXT:    s_lshr_b64 s[16:17], s[10:11], 24
+; SI-NEXT:    v_writelane_b32 v21, s16, 16
+; SI-NEXT:    v_writelane_b32 v21, s17, 17
+; SI-NEXT:    s_lshr_b64 s[16:17], s[10:11], 16
+; SI-NEXT:    s_add_i32 s4, s4, 0x30000
+; SI-NEXT:    s_add_i32 s5, s5, 0x30000
+; SI-NEXT:    s_add_i32 s6, s6, 0x30000
+; SI-NEXT:    s_add_i32 s7, s7, 0x30000
+; SI-NEXT:    s_add_i32 s8, s8, 0x30000
+; SI-NEXT:    s_add_i32 s9, s9, 0x30000
+; SI-NEXT:    v_writelane_b32 v21, s16, 14
+; SI-NEXT:    v_writelane_b32 v21, s17, 15
+; SI-NEXT:    s_lshr_b64 s[16:17], s[10:11], 8
+; SI-NEXT:    s_lshr_b64 s[34:35], s[8:9], 24
+; SI-NEXT:    s_lshr_b64 s[36:37], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[94:95], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[78:79], s[4:5], 24
+; SI-NEXT:    s_lshr_b64 s[88:89], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[90:91], s[4:5], 8
+; SI-NEXT:    v_writelane_b32 v21, s16, 12
+; SI-NEXT:    s_lshr_b64 s[54:55], s[42:43], 24
+; SI-NEXT:    s_lshr_b64 s[64:65], s[42:43], 16
+; SI-NEXT:    s_lshr_b64 s[66:67], s[42:43], 8
+; SI-NEXT:    s_lshr_b64 s[48:49], s[12:13], 24
+; SI-NEXT:    s_lshr_b64 s[50:51], s[12:13], 16
+; SI-NEXT:    s_lshr_b64 s[52:53], s[12:13], 8
+; SI-NEXT:    s_lshr_b64 s[38:39], s[8:9], 8
+; SI-NEXT:    s_lshr_b64 s[92:93], s[6:7], 24
+; SI-NEXT:    s_lshr_b64 s[30:31], s[6:7], 8
+; SI-NEXT:    s_lshr_b32 s47, s41, 24
+; SI-NEXT:    s_lshr_b32 s74, s41, 16
+; SI-NEXT:    s_lshr_b32 s60, s41, 8
+; SI-NEXT:    s_lshr_b32 s56, s15, 24
+; SI-NEXT:    s_lshr_b32 s98, s15, 16
+; SI-NEXT:    s_lshr_b32 s87, s15, 8
+; SI-NEXT:    s_lshr_b32 s86, s11, 24
+; SI-NEXT:    s_lshr_b32 s84, s11, 16
+; SI-NEXT:    s_lshr_b32 s82, s11, 8
+; SI-NEXT:    s_lshr_b32 s81, s43, 24
+; SI-NEXT:    s_lshr_b32 s80, s43, 16
+; SI-NEXT:    s_lshr_b32 s71, s43, 8
+; SI-NEXT:    s_lshr_b32 s70, s13, 24
+; SI-NEXT:    s_lshr_b32 s69, s13, 16
+; SI-NEXT:    s_lshr_b32 s68, s13, 8
+; SI-NEXT:    s_lshr_b32 s77, s9, 24
+; SI-NEXT:    s_lshr_b32 s75, s9, 16
+; SI-NEXT:    s_lshr_b32 s73, s9, 8
+; SI-NEXT:    s_lshr_b32 s37, s7, 24
+; SI-NEXT:    s_lshr_b32 s35, s7, 16
+; SI-NEXT:    s_lshr_b32 s95, s7, 8
+; SI-NEXT:    s_lshr_b32 s91, s5, 24
+; SI-NEXT:    s_lshr_b32 s89, s5, 16
+; SI-NEXT:    s_lshr_b32 s79, s5, 8
+; SI-NEXT:    v_writelane_b32 v21, s17, 13
 ; SI-NEXT:  .LBB97_3: ; %end
-; SI-NEXT:    s_and_b32 s4, s40, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v30
-; SI-NEXT:    v_or_b32_e32 v3, s4, v3
-; SI-NEXT:    s_and_b32 s4, s41, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s78, 8
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v25
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s88, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v18
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s16, s89, 24
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v7, v11, v7
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s16, s5
-; SI-NEXT:    v_or_b32_e32 v3, v3, v7
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 4, v0
-; SI-NEXT:    v_mov_b32_e32 v7, s4
-; SI-NEXT:    buffer_store_dword v7, v3, s[0:3], 0 offen
-; SI-NEXT:    s_and_b32 s4, s14, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v31
-; SI-NEXT:    v_or_b32_e32 v3, s4, v3
-; SI-NEXT:    s_and_b32 s4, s15, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s75, 8
+; SI-NEXT:    v_readlane_b32 s18, v21, 0
+; SI-NEXT:    v_readlane_b32 s19, v21, 1
+; SI-NEXT:    s_lshl_b32 s17, s18, 8
+; SI-NEXT:    v_readlane_b32 s18, v21, 2
+; SI-NEXT:    s_and_b32 s16, s40, 0xff
+; SI-NEXT:    v_readlane_b32 s19, v21, 3
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    s_and_b32 s17, s18, 0xff
+; SI-NEXT:    v_readlane_b32 s18, v21, 4
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_lshl_b32 s18, s18, 24
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v1, s16
+; SI-NEXT:    s_and_b32 s16, s41, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s60, 8
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    s_and_b32 s17, s74, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    s_lshl_b32 s18, s47, 24
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    v_readlane_b32 s16, v21, 6
+; SI-NEXT:    s_and_b32 s14, s14, 0xff
+; SI-NEXT:    v_readlane_b32 s17, v21, 7
+; SI-NEXT:    s_lshl_b32 s16, s16, 8
+; SI-NEXT:    v_readlane_b32 s19, v21, 5
+; SI-NEXT:    s_or_b32 s14, s14, s16
+; SI-NEXT:    v_readlane_b32 s16, v21, 8
+; SI-NEXT:    v_readlane_b32 s17, v21, 9
+; SI-NEXT:    s_and_b32 s16, s16, 0xff
+; SI-NEXT:    v_readlane_b32 s18, v21, 10
+; SI-NEXT:    s_lshl_b32 s16, s16, 16
+; SI-NEXT:    s_lshl_b32 s17, s18, 24
+; SI-NEXT:    s_and_b32 s14, s14, 0xffff
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v26
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s77, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v19
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s14, s79, 24
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v7, v11, v7
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s14, s5
-; SI-NEXT:    v_or_b32_e32 v3, v3, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 8, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v3, v7, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
+; SI-NEXT:    s_or_b32 s14, s14, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 12, v0
-; SI-NEXT:    v_mov_b32_e32 v7, s4
-; SI-NEXT:    buffer_store_dword v7, v3, s[0:3], 0 offen
-; SI-NEXT:    s_and_b32 s4, s12, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v29
-; SI-NEXT:    v_or_b32_e32 v3, s4, v3
-; SI-NEXT:    s_and_b32 s4, s13, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s72, 8
+; SI-NEXT:    v_mov_b32_e32 v2, s14
+; SI-NEXT:    s_and_b32 s14, s15, 0xff
+; SI-NEXT:    s_lshl_b32 s15, s87, 8
+; SI-NEXT:    s_or_b32 s14, s14, s15
+; SI-NEXT:    s_and_b32 s15, s98, 0xff
+; SI-NEXT:    s_lshl_b32 s15, s15, 16
+; SI-NEXT:    s_lshl_b32 s16, s56, 24
+; SI-NEXT:    s_and_b32 s14, s14, 0xffff
+; SI-NEXT:    s_or_b32 s15, s16, s15
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
+; SI-NEXT:    s_or_b32 s14, s14, s15
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s14
+; SI-NEXT:    v_readlane_b32 s14, v21, 12
+; SI-NEXT:    s_and_b32 s10, s10, 0xff
+; SI-NEXT:    v_readlane_b32 s15, v21, 13
+; SI-NEXT:    s_lshl_b32 s14, s14, 8
+; SI-NEXT:    s_or_b32 s10, s10, s14
+; SI-NEXT:    v_readlane_b32 s14, v21, 14
+; SI-NEXT:    v_readlane_b32 s15, v21, 15
+; SI-NEXT:    s_and_b32 s14, s14, 0xff
+; SI-NEXT:    v_readlane_b32 s16, v21, 16
+; SI-NEXT:    s_lshl_b32 s14, s14, 16
+; SI-NEXT:    s_lshl_b32 s15, s16, 24
+; SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; SI-NEXT:    s_or_b32 s14, s15, s14
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 12, v0
+; SI-NEXT:    s_or_b32 s10, s10, s14
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v23
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s74, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v17
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s12, s76, 24
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v7, v11, v7
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s12, s5
-; SI-NEXT:    v_or_b32_e32 v3, v3, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 16, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v3, v7, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s10, s11, 0xff
+; SI-NEXT:    s_lshl_b32 s11, s82, 8
+; SI-NEXT:    s_or_b32 s10, s10, s11
+; SI-NEXT:    s_and_b32 s11, s84, 0xff
+; SI-NEXT:    s_lshl_b32 s11, s11, 16
+; SI-NEXT:    s_lshl_b32 s14, s86, 24
+; SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; SI-NEXT:    s_or_b32 s11, s14, s11
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 16, v0
+; SI-NEXT:    s_or_b32 s10, s10, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 20, v0
-; SI-NEXT:    v_mov_b32_e32 v7, s4
-; SI-NEXT:    buffer_store_dword v7, v3, s[0:3], 0 offen
-; SI-NEXT:    s_and_b32 s4, s10, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v27
-; SI-NEXT:    v_or_b32_e32 v3, s4, v3
-; SI-NEXT:    s_and_b32 s4, s11, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s61, 8
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s10, s42, 0xff
+; SI-NEXT:    s_lshl_b32 s11, s66, 8
+; SI-NEXT:    s_or_b32 s10, s10, s11
+; SI-NEXT:    s_and_b32 s11, s64, 0xff
+; SI-NEXT:    s_lshl_b32 s11, s11, 16
+; SI-NEXT:    s_lshl_b32 s14, s54, 24
+; SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; SI-NEXT:    s_or_b32 s11, s14, s11
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 20, v0
+; SI-NEXT:    s_or_b32 s10, s10, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v20
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s63, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v16
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s10, s73, 24
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v7, v11, v7
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s10, s5
-; SI-NEXT:    v_or_b32_e32 v3, v3, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 24, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v3, v7, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s10, s43, 0xff
+; SI-NEXT:    s_lshl_b32 s11, s71, 8
+; SI-NEXT:    s_or_b32 s10, s10, s11
+; SI-NEXT:    s_and_b32 s11, s80, 0xff
+; SI-NEXT:    s_lshl_b32 s11, s11, 16
+; SI-NEXT:    s_lshl_b32 s14, s81, 24
+; SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; SI-NEXT:    s_or_b32 s11, s14, s11
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 24, v0
+; SI-NEXT:    s_or_b32 s10, s10, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 28, v0
-; SI-NEXT:    v_mov_b32_e32 v7, s4
-; SI-NEXT:    buffer_store_dword v7, v3, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v21
-; SI-NEXT:    s_and_b32 s4, s9, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s58, 8
-; SI-NEXT:    v_or_b32_e32 v3, v3, v5
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v12
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s60, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s10, s12, 0xff
+; SI-NEXT:    s_lshl_b32 s11, s52, 8
+; SI-NEXT:    s_or_b32 s10, s10, s11
+; SI-NEXT:    s_and_b32 s11, s50, 0xff
+; SI-NEXT:    s_lshl_b32 s11, s11, 16
+; SI-NEXT:    s_lshl_b32 s12, s48, 24
+; SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; SI-NEXT:    s_or_b32 s11, s12, s11
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 28, v0
+; SI-NEXT:    s_or_b32 s10, s10, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v9
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s9, s62, 24
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v5, v7, v5
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s9, s5
-; SI-NEXT:    v_or_b32_e32 v3, v3, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 32, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s10, s13, 0xff
+; SI-NEXT:    s_lshl_b32 s11, s68, 8
+; SI-NEXT:    s_or_b32 s10, s10, s11
+; SI-NEXT:    s_and_b32 s11, s69, 0xff
+; SI-NEXT:    s_lshl_b32 s11, s11, 16
+; SI-NEXT:    s_lshl_b32 s12, s70, 24
+; SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; SI-NEXT:    s_or_b32 s11, s12, s11
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 32, v0
+; SI-NEXT:    s_or_b32 s10, s10, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s8, s8, 0xff
+; SI-NEXT:    s_lshl_b32 s10, s38, 8
+; SI-NEXT:    s_or_b32 s8, s8, s10
+; SI-NEXT:    s_and_b32 s10, s36, 0xff
+; SI-NEXT:    s_lshl_b32 s10, s10, 16
+; SI-NEXT:    s_lshl_b32 s11, s34, 24
+; SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; SI-NEXT:    s_or_b32 s10, s11, s10
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 36, v0
+; SI-NEXT:    s_or_b32 s8, s8, s10
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 36, v0
-; SI-NEXT:    v_mov_b32_e32 v5, s4
-; SI-NEXT:    buffer_store_dword v5, v3, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v13
-; SI-NEXT:    s_and_b32 s4, s8, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s47, 8
-; SI-NEXT:    v_or_b32_e32 v3, v3, v4
-; SI-NEXT:    v_and_b32_e32 v4, 0xff, v8
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s57, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s8, s9, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s73, 8
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    s_and_b32 s9, s75, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_lshl_b32 s10, s77, 24
+; SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; SI-NEXT:    s_or_b32 s9, s10, s9
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 40, v0
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s8, s59, 24
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v4, v5, v4
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s8, s5
-; SI-NEXT:    v_or_b32_e32 v3, v3, v4
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 40, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s6, s6, 0xff
+; SI-NEXT:    s_lshl_b32 s8, s30, 8
+; SI-NEXT:    s_or_b32 s6, s6, s8
+; SI-NEXT:    s_and_b32 s8, s94, 0xff
+; SI-NEXT:    s_lshl_b32 s8, s8, 16
+; SI-NEXT:    s_lshl_b32 s9, s92, 24
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_or_b32 s8, s9, s8
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 44, v0
+; SI-NEXT:    s_or_b32 s6, s6, s8
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 44, v0
-; SI-NEXT:    v_mov_b32_e32 v4, s4
-; SI-NEXT:    buffer_store_dword v4, v3, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v32
-; SI-NEXT:    s_and_b32 s4, s7, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s45, 8
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v28
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s46, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s6, s7, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s95, 8
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    s_and_b32 s7, s35, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    s_lshl_b32 s8, s37, 24
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_or_b32 s7, s8, s7
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 48, v0
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v24
-; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s7, s56, 24
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    v_or_b32_e32 v3, v4, v3
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s4, s4, 0xff
+; SI-NEXT:    s_lshl_b32 s6, s90, 8
+; SI-NEXT:    s_or_b32 s4, s4, s6
+; SI-NEXT:    s_and_b32 s6, s88, 0xff
+; SI-NEXT:    s_lshl_b32 s6, s6, 16
+; SI-NEXT:    s_lshl_b32 s7, s78, 24
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s7, s5
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT:    s_or_b32 s6, s7, s6
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 52, v0
+; SI-NEXT:    s_or_b32 s4, s4, s6
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 52, v0
-; SI-NEXT:    v_mov_b32_e32 v3, s4
-; SI-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v22
-; SI-NEXT:    s_and_b32 s4, s6, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s42, 8
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v14
+; SI-NEXT:    v_mov_b32_e32 v2, s4
+; SI-NEXT:    s_and_b32 s4, s5, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s79, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s43, 0xff
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v10
+; SI-NEXT:    s_and_b32 s5, s89, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s6, s44, 24
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
+; SI-NEXT:    s_lshl_b32 s6, s91, 24
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s5, s6, s5
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 56, v0
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 56, v0
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 60, v0
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
+; SI-NEXT:    v_readlane_b32 s19, v21, 11
+; SI-NEXT:    v_readlane_b32 s17, v21, 17
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    v_readlane_b32 s35, v37, 3
-; SI-NEXT:    v_readlane_b32 s34, v37, 2
-; SI-NEXT:    v_readlane_b32 s31, v37, 1
-; SI-NEXT:    v_readlane_b32 s30, v37, 0
+; SI-NEXT:    v_readlane_b32 s99, v20, 35
+; SI-NEXT:    v_readlane_b32 s98, v20, 34
+; SI-NEXT:    v_readlane_b32 s97, v20, 33
+; SI-NEXT:    v_readlane_b32 s96, v20, 32
+; SI-NEXT:    v_readlane_b32 s87, v20, 31
+; SI-NEXT:    v_readlane_b32 s86, v20, 30
+; SI-NEXT:    v_readlane_b32 s85, v20, 29
+; SI-NEXT:    v_readlane_b32 s84, v20, 28
+; SI-NEXT:    v_readlane_b32 s83, v20, 27
+; SI-NEXT:    v_readlane_b32 s82, v20, 26
+; SI-NEXT:    v_readlane_b32 s81, v20, 25
+; SI-NEXT:    v_readlane_b32 s80, v20, 24
+; SI-NEXT:    v_readlane_b32 s71, v20, 23
+; SI-NEXT:    v_readlane_b32 s70, v20, 22
+; SI-NEXT:    v_readlane_b32 s69, v20, 21
+; SI-NEXT:    v_readlane_b32 s68, v20, 20
+; SI-NEXT:    v_readlane_b32 s67, v20, 19
+; SI-NEXT:    v_readlane_b32 s66, v20, 18
+; SI-NEXT:    v_readlane_b32 s65, v20, 17
+; SI-NEXT:    v_readlane_b32 s64, v20, 16
+; SI-NEXT:    v_readlane_b32 s55, v20, 15
+; SI-NEXT:    v_readlane_b32 s54, v20, 14
+; SI-NEXT:    v_readlane_b32 s53, v20, 13
+; SI-NEXT:    v_readlane_b32 s52, v20, 12
+; SI-NEXT:    v_readlane_b32 s51, v20, 11
+; SI-NEXT:    v_readlane_b32 s50, v20, 10
+; SI-NEXT:    v_readlane_b32 s49, v20, 9
+; SI-NEXT:    v_readlane_b32 s48, v20, 8
+; SI-NEXT:    v_readlane_b32 s39, v20, 7
+; SI-NEXT:    v_readlane_b32 s38, v20, 6
+; SI-NEXT:    v_readlane_b32 s37, v20, 5
+; SI-NEXT:    v_readlane_b32 s36, v20, 4
+; SI-NEXT:    v_readlane_b32 s35, v20, 3
+; SI-NEXT:    v_readlane_b32 s34, v20, 2
+; SI-NEXT:    v_readlane_b32 s31, v20, 1
+; SI-NEXT:    v_readlane_b32 s30, v20, 0
 ; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB97_4:
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_writelane_b32 v21, s4, 0
+; SI-NEXT:    v_writelane_b32 v21, s5, 1
+; SI-NEXT:    ; implicit-def: $sgpr4
 ; SI-NEXT:    ; implicit-def: $sgpr40
-; SI-NEXT:    ; implicit-def: $vgpr30
-; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr18
-; SI-NEXT:    ; implicit-def: $sgpr41
-; SI-NEXT:    ; implicit-def: $sgpr78
-; SI-NEXT:    ; implicit-def: $sgpr88
-; SI-NEXT:    ; implicit-def: $sgpr89
+; SI-NEXT:    ; implicit-def: $sgpr60
+; SI-NEXT:    ; implicit-def: $sgpr74
+; SI-NEXT:    ; implicit-def: $sgpr47
 ; SI-NEXT:    ; implicit-def: $sgpr14
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr26
-; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    ; implicit-def: $sgpr15
+; SI-NEXT:    ; implicit-def: $sgpr87
+; SI-NEXT:    ; implicit-def: $sgpr98
+; SI-NEXT:    ; implicit-def: $sgpr56
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr82
+; SI-NEXT:    ; implicit-def: $sgpr84
+; SI-NEXT:    ; implicit-def: $sgpr86
+; SI-NEXT:    ; implicit-def: $sgpr71
+; SI-NEXT:    ; implicit-def: $sgpr80
+; SI-NEXT:    ; implicit-def: $sgpr81
+; SI-NEXT:    ; implicit-def: $sgpr68
+; SI-NEXT:    ; implicit-def: $sgpr69
+; SI-NEXT:    ; implicit-def: $sgpr70
+; SI-NEXT:    ; implicit-def: $sgpr73
 ; SI-NEXT:    ; implicit-def: $sgpr75
 ; SI-NEXT:    ; implicit-def: $sgpr77
+; SI-NEXT:    ; implicit-def: $sgpr95
+; SI-NEXT:    ; implicit-def: $sgpr35
+; SI-NEXT:    ; implicit-def: $sgpr37
 ; SI-NEXT:    ; implicit-def: $sgpr79
+; SI-NEXT:    ; implicit-def: $sgpr89
+; SI-NEXT:    ; implicit-def: $sgpr91
+; SI-NEXT:    ; implicit-def: $sgpr42
+; SI-NEXT:    ; implicit-def: $sgpr66
+; SI-NEXT:    ; implicit-def: $sgpr64
+; SI-NEXT:    ; implicit-def: $sgpr54
 ; SI-NEXT:    ; implicit-def: $sgpr12
-; SI-NEXT:    ; implicit-def: $vgpr29
-; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $sgpr13
-; SI-NEXT:    ; implicit-def: $sgpr72
-; SI-NEXT:    ; implicit-def: $sgpr74
-; SI-NEXT:    ; implicit-def: $sgpr76
-; SI-NEXT:    ; implicit-def: $sgpr10
-; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $vgpr20
-; SI-NEXT:    ; implicit-def: $vgpr16
-; SI-NEXT:    ; implicit-def: $sgpr11
-; SI-NEXT:    ; implicit-def: $sgpr61
-; SI-NEXT:    ; implicit-def: $sgpr63
-; SI-NEXT:    ; implicit-def: $sgpr73
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $sgpr9
-; SI-NEXT:    ; implicit-def: $sgpr58
-; SI-NEXT:    ; implicit-def: $sgpr60
-; SI-NEXT:    ; implicit-def: $sgpr62
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr6
+; SI-NEXT:    ; implicit-def: $sgpr52
+; SI-NEXT:    ; implicit-def: $sgpr50
+; SI-NEXT:    ; implicit-def: $sgpr48
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $sgpr47
-; SI-NEXT:    ; implicit-def: $sgpr57
-; SI-NEXT:    ; implicit-def: $sgpr59
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $sgpr45
-; SI-NEXT:    ; implicit-def: $sgpr46
-; SI-NEXT:    ; implicit-def: $sgpr56
+; SI-NEXT:    ; implicit-def: $sgpr38
+; SI-NEXT:    ; implicit-def: $sgpr36
+; SI-NEXT:    ; implicit-def: $sgpr34
 ; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $sgpr42
-; SI-NEXT:    ; implicit-def: $sgpr43
-; SI-NEXT:    ; implicit-def: $sgpr44
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr24
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; implicit-def: $vgpr10
+; SI-NEXT:    ; implicit-def: $sgpr30
+; SI-NEXT:    ; implicit-def: $sgpr94
+; SI-NEXT:    ; implicit-def: $sgpr92
+; SI-NEXT:    ; implicit-def: $sgpr90
+; SI-NEXT:    ; implicit-def: $sgpr88
+; SI-NEXT:    ; implicit-def: $sgpr78
+; SI-NEXT:    v_writelane_b32 v21, s4, 2
+; SI-NEXT:    v_writelane_b32 v21, s5, 3
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_writelane_b32 v21, s4, 4
+; SI-NEXT:    v_writelane_b32 v21, s5, 5
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_writelane_b32 v21, s4, 6
+; SI-NEXT:    v_writelane_b32 v21, s5, 7
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_writelane_b32 v21, s4, 8
+; SI-NEXT:    v_writelane_b32 v21, s5, 9
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_writelane_b32 v21, s4, 10
+; SI-NEXT:    v_writelane_b32 v21, s5, 11
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_writelane_b32 v21, s4, 12
+; SI-NEXT:    v_writelane_b32 v21, s5, 13
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_writelane_b32 v21, s4, 14
+; SI-NEXT:    v_writelane_b32 v21, s5, 15
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    v_writelane_b32 v21, s4, 16
+; SI-NEXT:    v_writelane_b32 v21, s5, 17
+; SI-NEXT:    ; implicit-def: $sgpr4
 ; SI-NEXT:    s_branch .LBB97_2
 ;
 ; VI-LABEL: bitcast_v32i16_to_v64i8_scalar:
@@ -68563,433 +69064,541 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:44
-; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v46, v30
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:76
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32
-; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:4
-; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:32
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:28
-; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:24
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:20
-; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:40
-; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:36
-; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:64
-; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:60
-; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:56
+; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:16
+; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:24
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:72
-; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:52
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:68
-; SI-NEXT:    v_readfirstlane_b32 s15, v27
-; SI-NEXT:    v_readfirstlane_b32 s40, v26
-; SI-NEXT:    v_readfirstlane_b32 s12, v19
-; SI-NEXT:    v_readfirstlane_b32 s13, v18
-; SI-NEXT:    v_readfirstlane_b32 s10, v11
-; SI-NEXT:    v_readfirstlane_b32 s11, v10
-; SI-NEXT:    v_readfirstlane_b32 s8, v3
-; SI-NEXT:    v_readfirstlane_b32 s9, v2
-; SI-NEXT:    v_readfirstlane_b32 s7, v1
-; SI-NEXT:    v_readfirstlane_b32 s6, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 24, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v49, 8, v15
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v17
-; SI-NEXT:    v_lshlrev_b32_e32 v51, 24, v13
-; SI-NEXT:    v_lshlrev_b32_e32 v53, 8, v23
-; SI-NEXT:    v_lshlrev_b32_e32 v52, 24, v25
-; SI-NEXT:    v_lshlrev_b32_e32 v54, 24, v21
-; SI-NEXT:    v_lshlrev_b32_e32 v44, 24, v29
+; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:20
+; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:48
+; SI-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:44
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:56
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:52
+; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:4
+; SI-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:32
+; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:28
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:40
+; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:36
+; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:64
+; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
+; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:72
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:68
+; SI-NEXT:    v_readfirstlane_b32 s43, v1
+; SI-NEXT:    v_readfirstlane_b32 s42, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v42, 8, v3
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 24, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v40, 8, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v53, 24, v13
+; SI-NEXT:    v_lshlrev_b32_e32 v43, 8, v19
+; SI-NEXT:    v_lshlrev_b32_e32 v52, 24, v21
+; SI-NEXT:    v_lshlrev_b32_e32 v59, 8, v27
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v29
+; SI-NEXT:    v_lshlrev_b32_e32 v47, 8, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v45, 24, v9
+; SI-NEXT:    v_lshlrev_b32_e32 v57, 8, v15
+; SI-NEXT:    v_lshlrev_b32_e32 v56, 24, v17
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v23
+; SI-NEXT:    v_lshlrev_b32_e32 v41, 24, v25
 ; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_readfirstlane_b32 s44, v31
-; SI-NEXT:    v_readfirstlane_b32 s45, v32
-; SI-NEXT:    v_readfirstlane_b32 s42, v33
-; SI-NEXT:    v_readfirstlane_b32 s43, v34
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v35
-; SI-NEXT:    v_lshlrev_b32_e32 v43, 8, v36
-; SI-NEXT:    s_waitcnt vmcnt(13)
-; SI-NEXT:    v_lshlrev_b32_e32 v42, 24, v37
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 8, v38
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_lshlrev_b32_e32 v56, 8, v38
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 24, v36
+; SI-NEXT:    v_lshlrev_b32_e32 v19, 8, v48
+; SI-NEXT:    s_waitcnt vmcnt(12)
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v39
+; SI-NEXT:    s_waitcnt vmcnt(10)
+; SI-NEXT:    v_lshlrev_b32_e32 v61, 8, v37
 ; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_lshlrev_b32_e32 v58, 24, v39
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v49
 ; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_lshlrev_b32_e32 v47, 24, v48
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 8, v30
 ; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_lshlrev_b32_e32 v60, 8, v59
+; SI-NEXT:    v_lshlrev_b32_e32 v30, 24, v31
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_lshlrev_b32_e32 v61, 24, v61
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_lshlrev_b32_e32 v59, 24, v62
+; SI-NEXT:    v_lshlrev_b32_e32 v38, 8, v33
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshlrev_b32_e32 v29, 24, v34
+; SI-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB99_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    s_and_b32 s4, s20, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s21, 8
+; SI-NEXT:    v_and_b32_e32 v0, 0xff, v2
+; SI-NEXT:    v_or_b32_e32 v0, v0, v42
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0xff, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT:    v_or_b32_e32 v0, v18, v0
+; SI-NEXT:    v_or_b32_e32 v37, v1, v0
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v10
+; SI-NEXT:    v_and_b32_e32 v9, 0xff, v12
+; SI-NEXT:    v_or_b32_e32 v1, v1, v40
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(10) expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v60, v44
+; SI-NEXT:    v_or_b32_e32 v44, v53, v9
+; SI-NEXT:    v_or_b32_e32 v33, v1, v44
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v9, 0xff, v20
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; SI-NEXT:    v_mov_b32_e32 v15, v46
+; SI-NEXT:    v_or_b32_e32 v46, v52, v9
+; SI-NEXT:    v_and_b32_e32 v9, 0xff, v28
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; SI-NEXT:    v_or_b32_e32 v55, v3, v9
+; SI-NEXT:    v_and_b32_e32 v9, 0xff, v63
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v27, v13
+; SI-NEXT:    v_mov_b32_e32 v58, v8
+; SI-NEXT:    v_mov_b32_e32 v49, v45
+; SI-NEXT:    v_mov_b32_e32 v36, v24
+; SI-NEXT:    v_mov_b32_e32 v34, v26
+; SI-NEXT:    s_and_b32 s4, s16, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s17, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s22, 0xff
+; SI-NEXT:    s_and_b32 s5, s18, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s14, s23, 24
+; SI-NEXT:    s_lshl_b32 s6, s19, 24
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s14, s5
-; SI-NEXT:    s_or_b32 s41, s4, s5
-; SI-NEXT:    s_and_b32 s4, s18, 0xff
-; SI-NEXT:    s_lshl_b32 s4, s4, 16
-; SI-NEXT:    s_lshl_b32 s14, s19, 24
-; SI-NEXT:    s_or_b32 s4, s14, s4
-; SI-NEXT:    s_and_b32 s14, s28, 0xff
-; SI-NEXT:    s_lshl_b32 s46, s29, 8
-; SI-NEXT:    s_or_b32 s14, s14, s46
-; SI-NEXT:    s_and_b32 s46, s6, 0xff
-; SI-NEXT:    s_lshl_b32 s46, s46, 16
-; SI-NEXT:    s_lshl_b32 s47, s7, 24
-; SI-NEXT:    s_or_b32 s57, s47, s46
-; SI-NEXT:    s_and_b32 s46, s26, 0xff
-; SI-NEXT:    s_lshl_b32 s46, s46, 16
-; SI-NEXT:    s_lshl_b32 s47, s27, 24
-; SI-NEXT:    s_or_b32 s46, s47, s46
-; SI-NEXT:    s_and_b32 s47, s16, 0xff
-; SI-NEXT:    s_lshl_b32 s56, s17, 8
-; SI-NEXT:    s_or_b32 s47, s47, s56
-; SI-NEXT:    s_and_b32 s47, s47, 0xffff
-; SI-NEXT:    v_mov_b32_e32 v1, s4
-; SI-NEXT:    v_and_b32_e32 v9, 0xff, v6
-; SI-NEXT:    v_and_b32_e32 v10, 0xff, v8
-; SI-NEXT:    s_or_b32 s47, s47, s4
+; SI-NEXT:    s_or_b32 s12, s6, s5
+; SI-NEXT:    s_or_b32 s6, s4, s12
 ; SI-NEXT:    s_and_b32 s4, s24, 0xff
-; SI-NEXT:    s_lshl_b32 s56, s25, 8
-; SI-NEXT:    v_or_b32_e32 v9, v9, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT:    s_or_b32 s4, s4, s56
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; SI-NEXT:    v_or_b32_e32 v11, v2, v10
+; SI-NEXT:    s_lshl_b32 s5, s25, 8
+; SI-NEXT:    s_or_b32 s4, s4, s5
+; SI-NEXT:    s_and_b32 s5, s26, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    s_lshl_b32 s7, s27, 24
+; SI-NEXT:    s_or_b32 s14, s7, s5
+; SI-NEXT:    s_and_b32 s5, s20, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s21, 8
+; SI-NEXT:    s_or_b32 s5, s5, s7
+; SI-NEXT:    s_and_b32 s7, s5, 0xffff
+; SI-NEXT:    s_and_b32 s5, s22, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    s_lshl_b32 s8, s23, 24
+; SI-NEXT:    s_or_b32 s5, s8, s5
+; SI-NEXT:    s_or_b32 s13, s7, s5
+; SI-NEXT:    s_lshr_b64 s[8:9], s[12:13], 16
+; SI-NEXT:    s_and_b32 s7, s28, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s29, 8
+; SI-NEXT:    s_or_b32 s7, s7, s9
+; SI-NEXT:    s_and_b32 s9, s7, 0xffff
+; SI-NEXT:    s_and_b32 s7, s42, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    s_lshl_b32 s10, s43, 24
+; SI-NEXT:    s_or_b32 s7, s10, s7
+; SI-NEXT:    s_or_b32 s15, s9, s7
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_mov_b32_e32 v5, s46
-; SI-NEXT:    v_or_b32_e32 v10, v9, v11
-; SI-NEXT:    v_and_b32_e32 v9, 0xff, v4
-; SI-NEXT:    v_and_b32_e32 v13, 0xff, v14
-; SI-NEXT:    v_and_b32_e32 v17, 0xff, v16
-; SI-NEXT:    s_or_b32 s46, s4, s46
-; SI-NEXT:    s_and_b32 s4, s9, 0xff
-; SI-NEXT:    s_lshl_b32 s56, s8, 8
+; SI-NEXT:    s_lshr_b64 s[10:11], s[14:15], 16
+; SI-NEXT:    s_or_b32 s4, s4, s14
+; SI-NEXT:    v_mov_b32_e32 v39, v32
+; SI-NEXT:    s_lshr_b32 s9, s5, 16
+; SI-NEXT:    s_lshr_b32 s11, s7, 16
+; SI-NEXT:    s_mov_b32 s7, s13
+; SI-NEXT:    s_mov_b32 s5, s15
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT:    v_or_b32_e32 v1, v1, v43
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v48, v1, v46
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v26
+; SI-NEXT:    v_or_b32_e32 v1, v1, v59
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v35, v1, v55
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v50
+; SI-NEXT:    v_or_b32_e32 v1, v1, v17
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_or_b32_e32 v43, v13, v9
+; SI-NEXT:    v_or_b32_e32 v50, v1, v43
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v54
+; SI-NEXT:    v_and_b32_e32 v9, 0xff, v51
+; SI-NEXT:    v_or_b32_e32 v1, v1, v19
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT:    v_or_b32_e32 v13, v13, v49
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT:    s_or_b32 s4, s4, s56
-; SI-NEXT:    v_or_b32_e32 v15, v3, v9
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; SI-NEXT:    v_or_b32_e32 v19, v7, v17
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_or_b32_e32 v36, v13, v19
-; SI-NEXT:    v_and_b32_e32 v13, 0xff, v12
-; SI-NEXT:    v_and_b32_e32 v17, 0xff, v22
-; SI-NEXT:    v_and_b32_e32 v18, 0xff, v24
-; SI-NEXT:    v_and_b32_e32 v32, 0xff, v55
-; SI-NEXT:    v_or_b32_e32 v35, s4, v15
-; SI-NEXT:    s_and_b32 s4, s11, 0xff
-; SI-NEXT:    s_lshl_b32 s56, s10, 8
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT:    v_or_b32_e32 v17, v17, v53
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
-; SI-NEXT:    s_or_b32 s4, s4, s56
-; SI-NEXT:    v_or_b32_e32 v23, v51, v13
-; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; SI-NEXT:    v_or_b32_e32 v27, v52, v18
-; SI-NEXT:    v_or_b32_e32 v62, v47, v32
-; SI-NEXT:    v_and_b32_e32 v32, 0xff, v41
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_or_b32_e32 v18, v17, v27
-; SI-NEXT:    v_and_b32_e32 v17, 0xff, v20
-; SI-NEXT:    v_and_b32_e32 v21, 0xff, v30
-; SI-NEXT:    v_and_b32_e32 v26, 0xff, v50
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
-; SI-NEXT:    v_or_b32_e32 v37, s4, v23
-; SI-NEXT:    s_and_b32 s4, s13, 0xff
-; SI-NEXT:    s_lshl_b32 s56, s12, 8
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT:    v_or_b32_e32 v21, v21, v43
-; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; SI-NEXT:    v_or_b32_e32 v33, v58, v32
-; SI-NEXT:    v_and_b32_e32 v32, 0xff, v45
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v34, 0xff, v46
-; SI-NEXT:    s_or_b32 s4, s4, s56
-; SI-NEXT:    v_or_b32_e32 v25, v54, v17
-; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v21
-; SI-NEXT:    v_or_b32_e32 v31, v42, v26
-; SI-NEXT:    v_or_b32_e32 v32, v32, v60
-; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v34
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_or_b32_e32 v38, v21, v31
-; SI-NEXT:    v_and_b32_e32 v21, 0xff, v28
-; SI-NEXT:    v_and_b32_e32 v32, 0xffff, v32
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_mov_b32_e32 v13, v4
+; SI-NEXT:    v_mov_b32_e32 v4, v2
+; SI-NEXT:    v_mov_b32_e32 v2, v59
+; SI-NEXT:    v_mov_b32_e32 v59, v3
+; SI-NEXT:    v_mov_b32_e32 v3, v63
+; SI-NEXT:    v_mov_b32_e32 v63, v40
+; SI-NEXT:    v_mov_b32_e32 v40, v42
+; SI-NEXT:    v_or_b32_e32 v42, v11, v9
+; SI-NEXT:    v_or_b32_e32 v54, v1, v42
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v6
+; SI-NEXT:    v_and_b32_e32 v9, 0xff, v8
+; SI-NEXT:    v_or_b32_e32 v1, v1, v47
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v11, v45, v9
+; SI-NEXT:    v_or_b32_e32 v1, v1, v11
+; SI-NEXT:    v_mov_b32_e32 v19, v10
+; SI-NEXT:    v_lshr_b64 v[9:10], v[0:1], 16
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    v_and_b32_e32 v0, 0xff, v14
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_or_b32_e32 v63, v59, v34
-; SI-NEXT:    v_or_b32_e32 v39, s4, v25
-; SI-NEXT:    s_and_b32 s4, s40, 0xff
-; SI-NEXT:    s_lshl_b32 s56, s15, 8
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; SI-NEXT:    v_or_b32_e32 v48, v32, v63
-; SI-NEXT:    v_and_b32_e32 v32, 0xff, v57
-; SI-NEXT:    s_or_b32 s4, s4, s56
-; SI-NEXT:    v_or_b32_e32 v29, v44, v21
-; SI-NEXT:    v_and_b32_e32 v26, 0xff, v40
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_or_b32_e32 v26, v26, v56
-; SI-NEXT:    v_or_b32_e32 v34, v61, v32
-; SI-NEXT:    v_or_b32_e32 v32, s4, v29
-; SI-NEXT:    s_and_b32 s4, s43, 0xff
-; SI-NEXT:    s_lshl_b32 s56, s42, 8
-; SI-NEXT:    v_and_b32_e32 v26, 0xffff, v26
-; SI-NEXT:    s_or_b32 s4, s4, s56
-; SI-NEXT:    v_or_b32_e32 v26, v26, v62
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_alignbit_b32 v17, v18, v25, 16
-; SI-NEXT:    v_alignbit_b32 v25, v26, v33, 16
-; SI-NEXT:    v_or_b32_e32 v33, s4, v33
-; SI-NEXT:    s_and_b32 s4, s45, 0xff
-; SI-NEXT:    s_lshl_b32 s56, s44, 8
-; SI-NEXT:    s_and_b32 s14, s14, 0xffff
-; SI-NEXT:    s_or_b32 s4, s4, s56
-; SI-NEXT:    s_or_b32 s14, s14, s57
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_alignbit_b32 v1, s41, v1, 16
-; SI-NEXT:    v_alignbit_b32 v5, s14, v5, 16
-; SI-NEXT:    v_alignbit_b32 v9, v10, v15, 16
-; SI-NEXT:    v_alignbit_b32 v13, v36, v23, 16
-; SI-NEXT:    v_alignbit_b32 v21, v38, v29, 16
-; SI-NEXT:    v_alignbit_b32 v29, v48, v34, 16
-; SI-NEXT:    v_or_b32_e32 v34, s4, v34
-; SI-NEXT:    s_lshr_b32 s56, s5, 16
-; SI-NEXT:    s_lshr_b32 s57, s57, 16
+; SI-NEXT:    v_and_b32_e32 v10, 0xff, v16
+; SI-NEXT:    v_or_b32_e32 v0, v0, v57
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_or_b32_e32 v23, v56, v10
+; SI-NEXT:    v_mov_b32_e32 v8, v6
+; SI-NEXT:    v_mov_b32_e32 v6, v14
+; SI-NEXT:    v_or_b32_e32 v45, v0, v23
+; SI-NEXT:    v_and_b32_e32 v0, 0xff, v22
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v24
+; SI-NEXT:    v_lshr_b64 v[9:10], v[44:45], 16
+; SI-NEXT:    v_or_b32_e32 v0, v0, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_or_b32_e32 v14, v41, v14
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v24, v17
+; SI-NEXT:    v_mov_b32_e32 v17, v47
+; SI-NEXT:    v_or_b32_e32 v47, v0, v14
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshr_b64 v[9:10], v[46:47], 16
+; SI-NEXT:    v_mov_b32_e32 v46, v15
+; SI-NEXT:    v_and_b32_e32 v0, 0xff, v46
+; SI-NEXT:    v_and_b32_e32 v15, 0xff, v62
+; SI-NEXT:    v_or_b32_e32 v0, v0, v61
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_or_b32_e32 v31, v7, v15
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v9, v61
+; SI-NEXT:    v_mov_b32_e32 v61, v7
+; SI-NEXT:    v_mov_b32_e32 v7, v5
+; SI-NEXT:    v_mov_b32_e32 v5, v52
+; SI-NEXT:    v_mov_b32_e32 v52, v41
+; SI-NEXT:    v_mov_b32_e32 v41, v62
+; SI-NEXT:    v_mov_b32_e32 v62, v57
+; SI-NEXT:    v_mov_b32_e32 v57, v53
+; SI-NEXT:    v_mov_b32_e32 v53, v56
+; SI-NEXT:    v_or_b32_e32 v56, v0, v31
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v51, v22
+; SI-NEXT:    v_lshr_b64 v[21:22], v[55:56], 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v19
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v27
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v15, 0xff, v10
+; SI-NEXT:    v_or_b32_e32 v0, v0, v25
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_or_b32_e32 v22, v30, v15
+; SI-NEXT:    v_or_b32_e32 v44, v0, v22
+; SI-NEXT:    v_and_b32_e32 v0, 0xff, v32
+; SI-NEXT:    v_and_b32_e32 v15, 0xff, v60
+; SI-NEXT:    v_or_b32_e32 v0, v0, v38
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_or_b32_e32 v38, v29, v15
+; SI-NEXT:    v_lshr_b64 v[25:26], v[43:44], 16
+; SI-NEXT:    v_or_b32_e32 v43, v0, v38
+; SI-NEXT:    v_mov_b32_e32 v0, v30
+; SI-NEXT:    v_lshr_b64 v[29:30], v[42:43], 16
+; SI-NEXT:    v_mov_b32_e32 v42, v40
+; SI-NEXT:    v_mov_b32_e32 v40, v63
+; SI-NEXT:    v_mov_b32_e32 v63, v3
+; SI-NEXT:    v_mov_b32_e32 v3, v59
+; SI-NEXT:    v_mov_b32_e32 v59, v2
+; SI-NEXT:    v_mov_b32_e32 v10, v19
+; SI-NEXT:    v_mov_b32_e32 v2, v4
+; SI-NEXT:    v_mov_b32_e32 v4, v13
+; SI-NEXT:    v_mov_b32_e32 v13, v27
+; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v22
+; SI-NEXT:    v_mov_b32_e32 v14, v6
+; SI-NEXT:    v_mov_b32_e32 v6, v8
+; SI-NEXT:    v_mov_b32_e32 v8, v58
+; SI-NEXT:    v_mov_b32_e32 v22, v51
+; SI-NEXT:    v_mov_b32_e32 v51, v44
+; SI-NEXT:    v_mov_b32_e32 v44, v60
+; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v55, v43
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v30, v0
+; SI-NEXT:    v_mov_b32_e32 v26, v34
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v23
 ; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v31
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v62
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v63
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v38
+; SI-NEXT:    v_mov_b32_e32 v38, v1
+; SI-NEXT:    v_mov_b32_e32 v34, v45
+; SI-NEXT:    v_mov_b32_e32 v45, v49
+; SI-NEXT:    v_mov_b32_e32 v49, v47
+; SI-NEXT:    v_mov_b32_e32 v47, v17
+; SI-NEXT:    v_mov_b32_e32 v17, v24
+; SI-NEXT:    v_mov_b32_e32 v24, v36
+; SI-NEXT:    v_mov_b32_e32 v36, v56
+; SI-NEXT:    v_mov_b32_e32 v56, v53
+; SI-NEXT:    v_mov_b32_e32 v53, v57
+; SI-NEXT:    v_mov_b32_e32 v57, v62
+; SI-NEXT:    v_mov_b32_e32 v62, v41
+; SI-NEXT:    v_mov_b32_e32 v41, v52
+; SI-NEXT:    v_mov_b32_e32 v52, v5
+; SI-NEXT:    v_mov_b32_e32 v5, v7
+; SI-NEXT:    v_mov_b32_e32 v7, v61
+; SI-NEXT:    v_mov_b32_e32 v61, v9
 ; SI-NEXT:    s_cbranch_execnz .LBB99_3
 ; SI-NEXT:  .LBB99_2: ; %cmp.true
-; SI-NEXT:    s_add_i32 s45, s45, 3
-; SI-NEXT:    s_and_b32 s4, s45, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s44, 8
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v57
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; SI-NEXT:    s_add_i32 s24, s24, 3
+; SI-NEXT:    s_and_b32 s4, s24, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s25, 8
+; SI-NEXT:    s_add_i32 s26, s26, 3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT:    s_and_b32 s6, s26, 0xff
 ; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    s_lshl_b32 s5, s27, 24
+; SI-NEXT:    s_lshl_b32 s6, s6, 16
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_or_b32_e32 v1, v61, v1
-; SI-NEXT:    v_or_b32_e32 v1, s4, v1
-; SI-NEXT:    v_add_i32_e32 v34, vcc, 0x3000000, v1
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v45
+; SI-NEXT:    s_or_b32 s5, s5, s6
+; SI-NEXT:    s_add_i32 s28, s28, 3
+; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    s_and_b32 s5, s28, 0xff
+; SI-NEXT:    s_lshl_b32 s6, s29, 8
+; SI-NEXT:    s_add_i32 s42, s42, 3
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    s_and_b32 s7, s42, 0xff
+; SI-NEXT:    s_addk_i32 s5, 0x300
+; SI-NEXT:    s_lshl_b32 s6, s43, 24
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    s_and_b32 s6, s16, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s17, 8
+; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_or_b32 s6, s7, s6
+; SI-NEXT:    s_and_b32 s8, s18, 0xff
+; SI-NEXT:    s_addk_i32 s6, 0x300
+; SI-NEXT:    s_lshl_b32 s7, s19, 24
+; SI-NEXT:    s_lshl_b32 s8, s8, 16
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_or_b32 s7, s7, s8
+; SI-NEXT:    s_add_i32 s20, s20, 3
+; SI-NEXT:    s_or_b32 s6, s7, s6
+; SI-NEXT:    s_and_b32 s7, s20, 0xff
+; SI-NEXT:    s_lshl_b32 s8, s21, 8
+; SI-NEXT:    s_add_i32 s22, s22, 3
+; SI-NEXT:    s_or_b32 s7, s8, s7
+; SI-NEXT:    s_and_b32 s9, s22, 0xff
+; SI-NEXT:    s_addk_i32 s7, 0x300
+; SI-NEXT:    s_lshl_b32 s8, s23, 24
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
+; SI-NEXT:    s_and_b32 s7, s7, 0xffff
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    s_or_b32 s7, s8, s7
+; SI-NEXT:    s_add_i32 s4, s4, 0x3000000
+; SI-NEXT:    s_add_i32 s5, s5, 0x3000000
+; SI-NEXT:    s_add_i32 s6, s6, 0x3000000
+; SI-NEXT:    s_add_i32 s7, s7, 0x3000000
+; SI-NEXT:    s_lshr_b64 s[8:9], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[4:5], 16
+; SI-NEXT:    s_lshr_b32 s9, s7, 16
+; SI-NEXT:    s_lshr_b32 s11, s5, 16
+; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x300, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v1
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_or_b32_e32 v1, v9, v1
+; SI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v54, vcc, 0x3000000, v0
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v39
+; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v44
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x300, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v46
-; SI-NEXT:    v_or_b32_e32 v1, v60, v1
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x300, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    v_or_b32_e32 v5, v59, v5
-; SI-NEXT:    v_or_b32_e32 v1, v5, v1
-; SI-NEXT:    s_add_i32 s43, s43, 3
-; SI-NEXT:    v_add_i32_e32 v48, vcc, 0x3000000, v1
-; SI-NEXT:    s_and_b32 s4, s43, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s42, 8
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v41
-; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    v_or_b32_e32 v1, v9, v1
+; SI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SI-NEXT:    v_add_i32_e32 v55, vcc, 0x3000000, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v63
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    s_addk_i32 s4, 0x300
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_or_b32_e32 v1, v58, v1
-; SI-NEXT:    v_or_b32_e32 v1, s4, v1
-; SI-NEXT:    v_add_i32_e32 v33, vcc, 0x3000000, v1
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v40
+; SI-NEXT:    v_or_b32_e32 v1, v13, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v55
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; SI-NEXT:    v_or_b32_e32 v0, v17, v0
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x300, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v50, vcc, 0x3000000, v0
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v60
+; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v58
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v55
-; SI-NEXT:    v_or_b32_e32 v1, v56, v1
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x300, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    v_or_b32_e32 v5, v47, v5
-; SI-NEXT:    v_or_b32_e32 v1, v5, v1
-; SI-NEXT:    s_add_i32 s40, s40, 3
-; SI-NEXT:    v_add_i32_e32 v26, vcc, 0x3000000, v1
-; SI-NEXT:    s_and_b32 s4, s40, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s15, 8
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x300, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_or_b32_e32 v1, v30, v1
+; SI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SI-NEXT:    v_add_i32_e32 v51, vcc, 0x3000000, v0
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v26
+; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v28
-; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    v_or_b32_e32 v0, v59, v0
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    s_addk_i32 s4, 0x300
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x300, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_or_b32_e32 v1, v44, v1
-; SI-NEXT:    v_or_b32_e32 v1, s4, v1
-; SI-NEXT:    v_add_i32_e32 v32, vcc, 0x3000000, v1
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v30
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SI-NEXT:    v_add_i32_e32 v35, vcc, 0x3000000, v0
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v46
+; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v62
+; SI-NEXT:    v_or_b32_e32 v0, v61, v0
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v50
-; SI-NEXT:    v_or_b32_e32 v1, v43, v1
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x300, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    v_or_b32_e32 v5, v42, v5
-; SI-NEXT:    v_or_b32_e32 v1, v5, v1
-; SI-NEXT:    s_add_i32 s13, s13, 3
-; SI-NEXT:    v_add_i32_e32 v38, vcc, 0x3000000, v1
-; SI-NEXT:    s_and_b32 s4, s13, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s12, 8
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x300, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_or_b32_e32 v1, v7, v1
+; SI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SI-NEXT:    v_add_i32_e32 v36, vcc, 0x3000000, v0
+; SI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v20
-; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    s_addk_i32 s4, 0x300
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_or_b32_e32 v1, v54, v1
-; SI-NEXT:    v_or_b32_e32 v1, s4, v1
-; SI-NEXT:    v_add_i32_e32 v39, vcc, 0x3000000, v1
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v22
+; SI-NEXT:    v_or_b32_e32 v1, v52, v1
+; SI-NEXT:    v_lshr_b64 v[25:26], v[50:51], 16
+; SI-NEXT:    v_lshr_b64 v[29:30], v[54:55], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v36
+; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v51
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; SI-NEXT:    v_or_b32_e32 v0, v43, v0
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x300, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SI-NEXT:    v_add_i32_e32 v48, vcc, 0x3000000, v0
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v22
+; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v24
+; SI-NEXT:    v_or_b32_e32 v0, v5, v0
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v24
-; SI-NEXT:    v_or_b32_e32 v1, v53, v1
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x300, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    v_or_b32_e32 v5, v52, v5
-; SI-NEXT:    v_or_b32_e32 v1, v5, v1
-; SI-NEXT:    s_add_i32 s11, s11, 3
-; SI-NEXT:    v_add_i32_e32 v18, vcc, 0x3000000, v1
-; SI-NEXT:    s_and_b32 s4, s11, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s10, 8
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x300, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_or_b32_e32 v1, v41, v1
+; SI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SI-NEXT:    v_add_i32_e32 v49, vcc, 0x3000000, v0
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v10
+; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v12
-; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    v_or_b32_e32 v0, v40, v0
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    s_addk_i32 s4, 0x300
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x300, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_or_b32_e32 v1, v51, v1
-; SI-NEXT:    v_or_b32_e32 v1, s4, v1
-; SI-NEXT:    v_add_i32_e32 v37, vcc, 0x3000000, v1
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v14
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_or_b32_e32 v1, v53, v1
+; SI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SI-NEXT:    v_add_i32_e32 v33, vcc, 0x3000000, v0
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v14
+; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v16
+; SI-NEXT:    v_or_b32_e32 v0, v57, v0
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 3, v16
-; SI-NEXT:    v_or_b32_e32 v1, v49, v1
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x300, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    v_or_b32_e32 v5, v7, v5
-; SI-NEXT:    v_or_b32_e32 v1, v5, v1
-; SI-NEXT:    s_add_i32 s9, s9, 3
-; SI-NEXT:    v_add_i32_e32 v36, vcc, 0x3000000, v1
-; SI-NEXT:    s_and_b32 s4, s9, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s8, 8
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x300, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_or_b32_e32 v1, v56, v1
+; SI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SI-NEXT:    v_add_i32_e32 v34, vcc, 0x3000000, v0
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v2
+; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v4
-; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    v_or_b32_e32 v0, v42, v0
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    s_addk_i32 s4, 0x300
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x300, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    v_or_b32_e32 v1, v3, v1
-; SI-NEXT:    s_add_i32 s24, s24, 3
-; SI-NEXT:    v_or_b32_e32 v1, s4, v1
-; SI-NEXT:    s_and_b32 s4, s24, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s25, 8
-; SI-NEXT:    s_add_i32 s26, s26, 3
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_and_b32 s8, s26, 0xff
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s27, 24
-; SI-NEXT:    s_lshl_b32 s8, s8, 16
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s5, s8
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s28, s28, 3
-; SI-NEXT:    s_add_i32 s46, s4, 0x3000000
-; SI-NEXT:    s_and_b32 s4, s28, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s29, 8
-; SI-NEXT:    s_add_i32 s6, s6, 3
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_and_b32 s6, s6, 0xff
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s7, 24
-; SI-NEXT:    s_lshl_b32 s6, s6, 16
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s5, s6
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s16, s16, 3
-; SI-NEXT:    s_add_i32 s14, s4, 0x3000000
-; SI-NEXT:    s_and_b32 s4, s16, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s17, 8
-; SI-NEXT:    s_add_i32 s18, s18, 3
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_and_b32 s6, s18, 0xff
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s19, 24
-; SI-NEXT:    s_lshl_b32 s6, s6, 16
-; SI-NEXT:    v_add_i32_e32 v35, vcc, 0x3000000, v1
-; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v6
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s5, s6
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s20, s20, 3
-; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_or_b32_e32 v1, v18, v1
+; SI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SI-NEXT:    v_add_i32_e32 v37, vcc, 0x3000000, v0
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v6
+; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; SI-NEXT:    v_add_i32_e32 v1, vcc, 3, v8
-; SI-NEXT:    s_add_i32 s47, s4, 0x3000000
-; SI-NEXT:    s_and_b32 s4, s20, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s21, 8
-; SI-NEXT:    s_add_i32 s22, s22, 3
+; SI-NEXT:    v_or_b32_e32 v0, v47, v0
 ; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_and_b32 s6, s22, 0xff
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x300, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s23, 24
-; SI-NEXT:    s_lshl_b32 s6, s6, 16
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT:    v_or_b32_e32 v1, v2, v1
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s5, s6
+; SI-NEXT:    v_or_b32_e32 v1, v45, v1
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 0x3000000, v0
-; SI-NEXT:    s_add_i32 s41, s4, 0x3000000
-; SI-NEXT:    v_mov_b32_e32 v0, s47
-; SI-NEXT:    v_alignbit_b32 v1, s41, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s46
-; SI-NEXT:    v_alignbit_b32 v5, s14, v0, 16
-; SI-NEXT:    v_alignbit_b32 v9, v10, v35, 16
-; SI-NEXT:    v_alignbit_b32 v13, v36, v37, 16
-; SI-NEXT:    v_alignbit_b32 v17, v18, v39, 16
-; SI-NEXT:    v_alignbit_b32 v21, v38, v32, 16
-; SI-NEXT:    v_alignbit_b32 v25, v26, v33, 16
-; SI-NEXT:    v_alignbit_b32 v29, v48, v34, 16
-; SI-NEXT:    s_lshr_b32 s56, s41, 16
-; SI-NEXT:    s_lshr_b32 s57, s14, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v36
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v38
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v26
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v48
-; SI-NEXT:  .LBB99_3: ; %end
+; SI-NEXT:    v_add_i32_e32 v38, vcc, 0x3000000, v0
+; SI-NEXT:    v_lshr_b64 v[0:1], v[37:38], 16
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshr_b64 v[0:1], v[33:34], 16
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshr_b64 v[0:1], v[48:49], 16
+; SI-NEXT:    v_lshr_b64 v[21:22], v[35:36], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v38
+; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v34
+; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v49
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:  .LBB99_3: ; %end
+; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
@@ -69006,55 +69615,62 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32
 ; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT:    v_mov_b32_e32 v0, s47
-; SI-NEXT:    v_mov_b32_e32 v2, s41
-; SI-NEXT:    v_mov_b32_e32 v3, s56
-; SI-NEXT:    v_mov_b32_e32 v4, s46
-; SI-NEXT:    v_mov_b32_e32 v6, s14
-; SI-NEXT:    v_mov_b32_e32 v7, s57
-; SI-NEXT:    v_mov_b32_e32 v8, v35
-; SI-NEXT:    v_mov_b32_e32 v12, v37
-; SI-NEXT:    v_mov_b32_e32 v14, v36
-; SI-NEXT:    v_mov_b32_e32 v16, v39
-; SI-NEXT:    v_mov_b32_e32 v20, v32
-; SI-NEXT:    v_mov_b32_e32 v22, v38
-; SI-NEXT:    v_mov_b32_e32 v24, v33
-; SI-NEXT:    v_mov_b32_e32 v28, v34
-; SI-NEXT:    v_mov_b32_e32 v30, v48
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v1, s8
+; SI-NEXT:    v_mov_b32_e32 v2, s7
+; SI-NEXT:    v_mov_b32_e32 v3, s9
+; SI-NEXT:    v_mov_b32_e32 v4, s4
+; SI-NEXT:    v_mov_b32_e32 v5, s10
+; SI-NEXT:    v_mov_b32_e32 v6, s5
+; SI-NEXT:    v_mov_b32_e32 v7, s11
+; SI-NEXT:    v_mov_b32_e32 v8, v37
+; SI-NEXT:    s_waitcnt vmcnt(14)
+; SI-NEXT:    v_mov_b32_e32 v10, v38
+; SI-NEXT:    v_mov_b32_e32 v12, v33
+; SI-NEXT:    v_mov_b32_e32 v14, v34
+; SI-NEXT:    v_mov_b32_e32 v16, v48
+; SI-NEXT:    v_mov_b32_e32 v18, v49
+; SI-NEXT:    v_mov_b32_e32 v20, v35
+; SI-NEXT:    v_mov_b32_e32 v22, v36
+; SI-NEXT:    v_mov_b32_e32 v24, v50
+; SI-NEXT:    v_mov_b32_e32 v26, v51
+; SI-NEXT:    v_mov_b32_e32 v28, v54
+; SI-NEXT:    v_mov_b32_e32 v30, v55
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB99_4:
-; SI-NEXT:    ; implicit-def: $sgpr47
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr41
-; SI-NEXT:    ; implicit-def: $sgpr56
-; SI-NEXT:    ; implicit-def: $sgpr46
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $sgpr14
-; SI-NEXT:    ; implicit-def: $sgpr57
-; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $vgpr11
+; SI-NEXT:    v_mov_b32_e32 v39, v32
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr8
+; SI-NEXT:    ; implicit-def: $sgpr9
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr11
 ; SI-NEXT:    ; implicit-def: $vgpr37
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $vgpr36
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    ; implicit-def: $vgpr11
+; SI-NEXT:    ; implicit-def: $vgpr33
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; SI-NEXT:    ; implicit-def: $vgpr15
-; SI-NEXT:    ; implicit-def: $vgpr39
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr18
+; SI-NEXT:    ; implicit-def: $vgpr48
+; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
 ; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr35
 ; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr38
 ; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; implicit-def: $vgpr33
+; SI-NEXT:    ; implicit-def: $vgpr50
 ; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr26
 ; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $vgpr34
+; SI-NEXT:    ; implicit-def: $vgpr54
 ; SI-NEXT:    ; implicit-def: $vgpr29
-; SI-NEXT:    ; implicit-def: $vgpr48
 ; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    s_branch .LBB99_2
 ;
@@ -76266,716 +76882,611 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32
 ; SI-LABEL: bitcast_v32f16_to_v64i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
-; SI-NEXT:    v_cvt_f16_f32_e32 v22, s17
+; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_writelane_b32 v40, s30, 0
+; SI-NEXT:    v_writelane_b32 v40, s31, 1
+; SI-NEXT:    v_cvt_f16_f32_e32 v21, s17
 ; SI-NEXT:    v_cvt_f16_f32_e32 v20, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v28, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v27, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v21, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v22, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v25, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v24, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, v6
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_cvt_f16_f32_e32 v62, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v35, v8
-; SI-NEXT:    v_cvt_f16_f32_e32 v34, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v23, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v28, v8
+; SI-NEXT:    v_cvt_f16_f32_e32 v27, v7
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v10
-; SI-NEXT:    v_cvt_f16_f32_e32 v30, v9
-; SI-NEXT:    v_cvt_f16_f32_e32 v53, v12
-; SI-NEXT:    v_cvt_f16_f32_e32 v50, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v26, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v33, v12
+; SI-NEXT:    v_cvt_f16_f32_e32 v31, v11
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v14
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, v13
-; SI-NEXT:    v_cvt_f16_f32_e32 v41, v16
-; SI-NEXT:    v_cvt_f16_f32_e32 v40, v15
+; SI-NEXT:    v_cvt_f16_f32_e32 v29, v13
+; SI-NEXT:    v_cvt_f16_f32_e32 v38, v16
+; SI-NEXT:    v_cvt_f16_f32_e32 v37, v15
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v18
-; SI-NEXT:    v_cvt_f16_f32_e32 v55, v17
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, s16
+; SI-NEXT:    v_cvt_f16_f32_e32 v35, v17
+; SI-NEXT:    v_cvt_f16_f32_e32 v9, s16
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, s19
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, s18
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, s21
-; SI-NEXT:    v_cvt_f16_f32_e32 v12, s20
+; SI-NEXT:    v_cvt_f16_f32_e32 v8, s18
+; SI-NEXT:    v_cvt_f16_f32_e32 v12, s21
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, s20
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, s23
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, s22
-; SI-NEXT:    v_cvt_f16_f32_e32 v46, s25
-; SI-NEXT:    v_cvt_f16_f32_e32 v42, s24
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, s22
+; SI-NEXT:    v_cvt_f16_f32_e32 v15, s25
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, s24
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, s27
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, s26
-; SI-NEXT:    v_cvt_f16_f32_e32 v26, s29
-; SI-NEXT:    v_cvt_f16_f32_e32 v25, s28
+; SI-NEXT:    v_cvt_f16_f32_e32 v13, s26
+; SI-NEXT:    v_cvt_f16_f32_e32 v17, s29
+; SI-NEXT:    v_cvt_f16_f32_e32 v16, s28
+; SI-NEXT:    v_writelane_b32 v40, s34, 2
+; SI-NEXT:    v_writelane_b32 v40, s35, 3
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v19
+; SI-NEXT:    v_writelane_b32 v40, s36, 4
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    v_writelane_b32 v40, s37, 5
 ; SI-NEXT:    s_cbranch_scc0 .LBB105_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v22
-; SI-NEXT:    v_or_b32_e32 v37, v10, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v7
-; SI-NEXT:    v_or_b32_e32 v32, v9, v8
-; SI-NEXT:    v_alignbit_b32 v8, v32, v37, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v32, v37, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v32, v37, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
-; SI-NEXT:    v_or_b32_e32 v24, v12, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v6
-; SI-NEXT:    v_or_b32_e32 v23, v11, v8
-; SI-NEXT:    v_alignbit_b32 v8, v23, v24, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v23, v24, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v23, v24, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v46
-; SI-NEXT:    v_or_b32_e32 v18, v42, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
-; SI-NEXT:    v_or_b32_e32 v19, v14, v8
-; SI-NEXT:    v_alignbit_b32 v8, v19, v18, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v19, v18, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v19, v18, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v26
-; SI-NEXT:    v_or_b32_e32 v16, v25, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v20
-; SI-NEXT:    v_or_b32_e32 v17, v28, v8
-; SI-NEXT:    v_alignbit_b32 v8, v17, v16, 24
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v17, v16, 16
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v8, v17, v16, 8
-; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v27
-; SI-NEXT:    v_or_b32_e32 v15, v21, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v4
-; SI-NEXT:    v_or_b32_e32 v14, v62, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v35
-; SI-NEXT:    v_or_b32_e32 v12, v34, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v3
-; SI-NEXT:    v_or_b32_e32 v13, v30, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v53
-; SI-NEXT:    v_or_b32_e32 v10, v50, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v2
-; SI-NEXT:    v_or_b32_e32 v11, v48, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v41
-; SI-NEXT:    v_or_b32_e32 v9, v40, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; SI-NEXT:    v_alignbit_b32 v22, v11, v10, 24
-; SI-NEXT:    v_or_b32_e32 v8, v55, v8
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v22, v11, v10, 16
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v22, v8, v9, 24
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v22, v8, v9, 16
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v22, v8, v9, 8
-; SI-NEXT:    v_alignbit_b32 v57, v14, v15, 24
-; SI-NEXT:    v_alignbit_b32 v58, v14, v15, 16
-; SI-NEXT:    v_alignbit_b32 v61, v14, v15, 8
-; SI-NEXT:    v_alignbit_b32 v44, v13, v12, 24
-; SI-NEXT:    v_alignbit_b32 v47, v13, v12, 16
-; SI-NEXT:    v_alignbit_b32 v56, v13, v12, 8
-; SI-NEXT:    v_alignbit_b32 v43, v11, v10, 8
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshrrev_b32_e32 v52, 8, v32
-; SI-NEXT:    v_lshrrev_b32_e32 v39, 8, v23
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 8, v19
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 8, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v28, 8, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v63, 8, v13
-; SI-NEXT:    v_lshrrev_b32_e32 v59, 8, v11
-; SI-NEXT:    v_lshrrev_b32_e32 v45, 8, v8
-; SI-NEXT:    v_bfe_u32 v54, v7, 8, 8
-; SI-NEXT:    v_bfe_u32 v51, v6, 8, 8
-; SI-NEXT:    v_bfe_u32 v49, v5, 8, 8
-; SI-NEXT:    v_bfe_u32 v38, v20, 8, 8
-; SI-NEXT:    v_bfe_u32 v33, v4, 8, 8
-; SI-NEXT:    v_bfe_u32 v29, v3, 8, 8
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_bfe_u32 v22, v2, 8, 8
-; SI-NEXT:    v_bfe_u32 v60, v1, 8, 8
+; SI-NEXT:    v_readfirstlane_b32 s4, v21
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v9
+; SI-NEXT:    s_or_b32 s18, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v7
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v8
+; SI-NEXT:    s_or_b32 s19, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v12
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v11
+; SI-NEXT:    s_or_b32 s16, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v6
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v10
+; SI-NEXT:    s_or_b32 s17, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v15
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v14
+; SI-NEXT:    s_or_b32 s14, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v5
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v13
+; SI-NEXT:    s_or_b32 s15, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v17
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v16
+; SI-NEXT:    s_or_b32 s12, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v20
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v22
+; SI-NEXT:    s_or_b32 s13, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v25
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v24
+; SI-NEXT:    s_or_b32 s10, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v4
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v23
+; SI-NEXT:    s_or_b32 s11, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v28
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v27
+; SI-NEXT:    s_or_b32 s8, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v3
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v26
+; SI-NEXT:    s_or_b32 s9, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v33
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v31
+; SI-NEXT:    s_or_b32 s6, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v2
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v29
+; SI-NEXT:    s_or_b32 s7, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s4, v38
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s5, v37
+; SI-NEXT:    s_lshr_b64 s[20:21], s[18:19], 24
+; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s5, v1
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    v_readfirstlane_b32 s21, v35
+; SI-NEXT:    s_lshr_b64 s[22:23], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[26:27], s[18:19], 8
+; SI-NEXT:    s_lshr_b64 s[24:25], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[28:29], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[42:43], s[16:17], 8
+; SI-NEXT:    s_lshr_b64 s[40:41], s[14:15], 24
+; SI-NEXT:    s_lshr_b64 s[44:45], s[14:15], 16
+; SI-NEXT:    s_or_b32 s5, s21, s5
+; SI-NEXT:    s_lshr_b64 s[56:57], s[14:15], 8
+; SI-NEXT:    s_lshr_b64 s[46:47], s[12:13], 24
+; SI-NEXT:    s_lshr_b64 s[58:59], s[12:13], 16
+; SI-NEXT:    s_lshr_b64 s[62:63], s[12:13], 8
+; SI-NEXT:    s_lshr_b64 s[60:61], s[10:11], 24
+; SI-NEXT:    s_lshr_b64 s[72:73], s[10:11], 16
+; SI-NEXT:    s_lshr_b64 s[76:77], s[10:11], 8
+; SI-NEXT:    s_lshr_b64 s[74:75], s[8:9], 24
+; SI-NEXT:    s_lshr_b64 s[78:79], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[90:91], s[8:9], 8
+; SI-NEXT:    s_lshr_b64 s[88:89], s[6:7], 24
+; SI-NEXT:    s_lshr_b64 s[92:93], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[94:95], s[6:7], 8
+; SI-NEXT:    s_lshr_b64 s[30:31], s[4:5], 24
+; SI-NEXT:    s_lshr_b64 s[34:35], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[36:37], s[4:5], 8
+; SI-NEXT:    s_lshr_b32 s45, s19, 8
+; SI-NEXT:    s_lshr_b32 s43, s17, 8
+; SI-NEXT:    s_lshr_b32 s41, s15, 8
+; SI-NEXT:    s_lshr_b32 s29, s13, 8
+; SI-NEXT:    s_lshr_b32 s27, s11, 8
+; SI-NEXT:    s_lshr_b32 s25, s9, 8
+; SI-NEXT:    s_lshr_b32 s23, s7, 8
+; SI-NEXT:    s_lshr_b32 s21, s5, 8
+; SI-NEXT:    v_bfe_u32 v48, v7, 8, 8
+; SI-NEXT:    v_bfe_u32 v39, v6, 8, 8
+; SI-NEXT:    v_bfe_u32 v36, v5, 8, 8
+; SI-NEXT:    v_bfe_u32 v34, v20, 8, 8
+; SI-NEXT:    v_bfe_u32 v32, v4, 8, 8
+; SI-NEXT:    v_bfe_u32 v30, v3, 8, 8
+; SI-NEXT:    v_bfe_u32 v19, v2, 8, 8
+; SI-NEXT:    v_bfe_u32 v18, v1, 8, 8
 ; SI-NEXT:    s_cbranch_execnz .LBB105_3
 ; SI-NEXT:  .LBB105_2: ; %cmp.true
-; SI-NEXT:    v_cvt_f32_f16_e32 v18, v20
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v41
-; SI-NEXT:    s_waitcnt expcnt(6)
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v40
+; SI-NEXT:    v_cvt_f32_f16_e32 v18, v38
+; SI-NEXT:    v_cvt_f32_f16_e32 v19, v37
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
-; SI-NEXT:    v_cvt_f16_f32_e32 v20, v18
-; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(5)
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v55
-; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
-; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
+; SI-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
+; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; SI-NEXT:    v_readfirstlane_b32 s4, v18
+; SI-NEXT:    v_cvt_f32_f16_e32 v18, v35
+; SI-NEXT:    v_readfirstlane_b32 s5, v19
+; SI-NEXT:    v_cvt_f32_f16_e32 v19, v33
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; SI-NEXT:    v_or_b32_e32 v9, v9, v8
-; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v53
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v1
-; SI-NEXT:    v_or_b32_e32 v8, v10, v8
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v50
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v48
-; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v35
-; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
+; SI-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
+; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
+; SI-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
+; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    v_readfirstlane_b32 s6, v18
+; SI-NEXT:    v_cvt_f32_f16_e32 v18, v31
+; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    v_readfirstlane_b32 s5, v1
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
+; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    v_readfirstlane_b32 s6, v19
+; SI-NEXT:    v_cvt_f32_f16_e32 v19, v28
+; SI-NEXT:    v_readfirstlane_b32 s7, v18
+; SI-NEXT:    v_cvt_f32_f16_e32 v18, v29
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
-; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
-; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v14, v34
-; SI-NEXT:    v_or_b32_e32 v10, v10, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v2
+; SI-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
+; SI-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
+; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
+; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
+; SI-NEXT:    s_lshl_b32 s6, s6, 16
+; SI-NEXT:    s_or_b32 s6, s7, s6
+; SI-NEXT:    v_readfirstlane_b32 s8, v18
+; SI-NEXT:    v_cvt_f32_f16_e32 v18, v27
+; SI-NEXT:    v_readfirstlane_b32 s7, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:    v_or_b32_e32 v11, v12, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v30
-; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    s_lshl_b32 s7, s7, 16
+; SI-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
+; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
+; SI-NEXT:    s_or_b32 s7, s8, s7
+; SI-NEXT:    v_readfirstlane_b32 s8, v19
+; SI-NEXT:    v_cvt_f32_f16_e32 v19, v25
+; SI-NEXT:    v_readfirstlane_b32 s9, v18
+; SI-NEXT:    v_cvt_f32_f16_e32 v18, v26
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; SI-NEXT:    v_cvt_f32_f16_e32 v16, v21
-; SI-NEXT:    v_or_b32_e32 v12, v14, v12
-; SI-NEXT:    v_cvt_f32_f16_e32 v14, v27
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT:    v_or_b32_e32 v13, v13, v15
-; SI-NEXT:    v_add_f32_e32 v15, 0x38000000, v16
-; SI-NEXT:    v_cvt_f32_f16_e32 v16, v62
-; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
+; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
+; SI-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
+; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
+; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
+; SI-NEXT:    v_readfirstlane_b32 s10, v18
+; SI-NEXT:    v_cvt_f32_f16_e32 v18, v24
+; SI-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; SI-NEXT:    s_lshl_b32 s8, s8, 16
 ; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
+; SI-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
+; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
+; SI-NEXT:    s_or_b32 s8, s9, s8
+; SI-NEXT:    v_readfirstlane_b32 s9, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; SI-NEXT:    v_add_f32_e32 v16, 0x38000000, v16
-; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT:    v_or_b32_e32 v15, v15, v14
-; SI-NEXT:    v_cvt_f32_f16_e32 v17, v26
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v4
-; SI-NEXT:    v_or_b32_e32 v14, v16, v14
-; SI-NEXT:    v_cvt_f32_f16_e32 v16, v25
-; SI-NEXT:    v_cvt_f32_f16_e32 v19, v46
+; SI-NEXT:    v_readfirstlane_b32 s11, v18
+; SI-NEXT:    v_cvt_f32_f16_e32 v18, v23
+; SI-NEXT:    s_lshl_b32 s9, s9, 16
 ; SI-NEXT:    v_add_f32_e32 v17, 0x38000000, v17
+; SI-NEXT:    s_or_b32 s9, s10, s9
+; SI-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
+; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
+; SI-NEXT:    v_readfirstlane_b32 s10, v19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
 ; SI-NEXT:    v_add_f32_e32 v16, 0x38000000, v16
+; SI-NEXT:    s_lshl_b32 s10, s10, 16
 ; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; SI-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
-; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT:    v_or_b32_e32 v16, v16, v17
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v20
-; SI-NEXT:    v_cvt_f32_f16_e32 v21, v42
+; SI-NEXT:    s_or_b32 s10, s11, s10
+; SI-NEXT:    v_readfirstlane_b32 s11, v4
+; SI-NEXT:    s_lshl_b32 s11, s11, 16
+; SI-NEXT:    v_readfirstlane_b32 s12, v18
+; SI-NEXT:    s_or_b32 s11, s12, s11
+; SI-NEXT:    v_readfirstlane_b32 s12, v17
+; SI-NEXT:    v_cvt_f32_f16_e32 v17, v20
+; SI-NEXT:    v_readfirstlane_b32 s13, v16
+; SI-NEXT:    v_cvt_f32_f16_e32 v16, v22
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; SI-NEXT:    v_cvt_f32_f16_e32 v15, v15
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; SI-NEXT:    v_add_f32_e32 v17, 0x38000000, v17
 ; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT:    v_add_f32_e32 v21, 0x38000000, v21
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v18, v18
-; SI-NEXT:    v_cvt_f16_f32_e32 v21, v21
+; SI-NEXT:    v_cvt_f16_f32_e32 v20, v17
+; SI-NEXT:    v_add_f32_e32 v16, 0x38000000, v16
+; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
+; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
+; SI-NEXT:    v_add_f32_e32 v15, 0x38000000, v15
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    s_lshl_b32 s12, s12, 16
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
 ; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; SI-NEXT:    s_or_b32 s12, s13, s12
+; SI-NEXT:    v_readfirstlane_b32 s13, v20
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; SI-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
-; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v23, v23
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v5
+; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
+; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; SI-NEXT:    s_lshl_b32 s13, s13, 16
+; SI-NEXT:    v_readfirstlane_b32 s14, v16
+; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
+; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
+; SI-NEXT:    v_readfirstlane_b32 s17, v11
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, v21
+; SI-NEXT:    s_or_b32 s13, s14, s13
+; SI-NEXT:    v_readfirstlane_b32 s14, v15
+; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; SI-NEXT:    s_lshl_b32 s14, s14, 16
+; SI-NEXT:    v_readfirstlane_b32 s15, v14
 ; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
-; SI-NEXT:    v_or_b32_e32 v17, v18, v17
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v19
-; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; SI-NEXT:    v_or_b32_e32 v18, v21, v18
-; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT:    s_or_b32 s14, s15, s14
+; SI-NEXT:    v_readfirstlane_b32 s15, v5
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT:    s_lshl_b32 s15, s15, 16
+; SI-NEXT:    v_readfirstlane_b32 s16, v13
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
+; SI-NEXT:    s_or_b32 s15, s16, s15
+; SI-NEXT:    v_readfirstlane_b32 s16, v12
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
+; SI-NEXT:    s_lshl_b32 s16, s16, 16
+; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
 ; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    v_readfirstlane_b32 s17, v6
 ; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; SI-NEXT:    v_alignbit_b32 v57, v14, v15, 24
-; SI-NEXT:    v_alignbit_b32 v58, v14, v15, 16
-; SI-NEXT:    v_alignbit_b32 v61, v14, v15, 8
-; SI-NEXT:    v_alignbit_b32 v44, v13, v12, 24
-; SI-NEXT:    v_alignbit_b32 v47, v13, v12, 16
-; SI-NEXT:    v_alignbit_b32 v56, v13, v12, 8
-; SI-NEXT:    v_alignbit_b32 v43, v11, v10, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 8, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v28, 8, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v63, 8, v13
-; SI-NEXT:    v_lshrrev_b32_e32 v59, 8, v11
-; SI-NEXT:    v_lshrrev_b32_e32 v45, 8, v8
-; SI-NEXT:    v_bfe_u32 v54, v7, 8, 8
-; SI-NEXT:    v_bfe_u32 v51, v6, 8, 8
-; SI-NEXT:    v_bfe_u32 v49, v5, 8, 8
-; SI-NEXT:    v_bfe_u32 v38, v20, 8, 8
-; SI-NEXT:    v_bfe_u32 v33, v4, 8, 8
-; SI-NEXT:    v_bfe_u32 v29, v3, 8, 8
-; SI-NEXT:    v_bfe_u32 v60, v1, 8, 8
-; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cvt_f32_f16_e32 v19, v19
-; SI-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
-; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
-; SI-NEXT:    v_or_b32_e32 v19, v19, v22
-; SI-NEXT:    v_add_f32_e32 v22, 0x38000000, v23
-; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 8, v19
-; SI-NEXT:    v_add_f32_e32 v21, 0x38000000, v21
-; SI-NEXT:    v_cvt_f16_f32_e32 v21, v21
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; SI-NEXT:    v_or_b32_e32 v24, v22, v21
-; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v6
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cvt_f32_f16_e32 v25, v25
-; SI-NEXT:    v_add_f32_e32 v25, 0x38000000, v25
-; SI-NEXT:    v_cvt_f16_f32_e32 v25, v25
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v23, v23
-; SI-NEXT:    v_add_f32_e32 v23, 0x38000000, v23
-; SI-NEXT:    v_cvt_f16_f32_e32 v23, v23
-; SI-NEXT:    v_or_b32_e32 v23, v23, v22
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshrrev_b32_e32 v39, 8, v23
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT:    v_add_f32_e32 v21, 0x38000000, v21
-; SI-NEXT:    v_cvt_f16_f32_e32 v21, v21
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v22, v22
-; SI-NEXT:    v_add_f32_e32 v22, 0x38000000, v22
-; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
-; SI-NEXT:    v_or_b32_e32 v37, v22, v21
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v7
-; SI-NEXT:    v_or_b32_e32 v32, v25, v21
-; SI-NEXT:    v_alignbit_b32 v21, v32, v37, 24
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v21, v32, v37, 16
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v21, v32, v37, 8
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v21, v23, v24, 24
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v21, v23, v24, 16
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v21, v23, v24, 8
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v21, v19, v18, 24
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v21, v19, v18, 16
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v21, v19, v18, 8
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v21, v17, v16, 24
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v21, v17, v16, 16
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v21, v17, v16, 8
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v21, v11, v10, 24
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v21, v11, v10, 16
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v21, v8, v9, 24
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v21, v8, v9, 16
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v21, v8, v9, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v52, 8, v32
-; SI-NEXT:    v_bfe_u32 v22, v2, 8, 8
-; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
+; SI-NEXT:    s_lshl_b32 s17, s17, 16
+; SI-NEXT:    v_readfirstlane_b32 s18, v10
+; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; SI-NEXT:    s_or_b32 s17, s18, s17
+; SI-NEXT:    v_readfirstlane_b32 s18, v11
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    v_readfirstlane_b32 s19, v9
+; SI-NEXT:    s_or_b32 s18, s19, s18
+; SI-NEXT:    v_readfirstlane_b32 s19, v7
+; SI-NEXT:    s_lshl_b32 s19, s19, 16
+; SI-NEXT:    v_readfirstlane_b32 s20, v8
+; SI-NEXT:    s_or_b32 s19, s20, s19
+; SI-NEXT:    s_lshr_b64 s[20:21], s[18:19], 24
+; SI-NEXT:    s_lshr_b64 s[22:23], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[26:27], s[18:19], 8
+; SI-NEXT:    s_lshr_b64 s[24:25], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[28:29], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[42:43], s[16:17], 8
+; SI-NEXT:    s_lshr_b64 s[40:41], s[14:15], 24
+; SI-NEXT:    s_lshr_b64 s[44:45], s[14:15], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[14:15], 8
+; SI-NEXT:    s_lshr_b64 s[46:47], s[12:13], 24
+; SI-NEXT:    s_lshr_b64 s[58:59], s[12:13], 16
+; SI-NEXT:    s_lshr_b64 s[62:63], s[12:13], 8
+; SI-NEXT:    s_lshr_b64 s[60:61], s[10:11], 24
+; SI-NEXT:    s_lshr_b64 s[72:73], s[10:11], 16
+; SI-NEXT:    s_lshr_b64 s[76:77], s[10:11], 8
+; SI-NEXT:    s_lshr_b64 s[74:75], s[8:9], 24
+; SI-NEXT:    s_lshr_b64 s[78:79], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[90:91], s[8:9], 8
+; SI-NEXT:    s_lshr_b64 s[88:89], s[6:7], 24
+; SI-NEXT:    s_lshr_b64 s[92:93], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[94:95], s[6:7], 8
+; SI-NEXT:    s_lshr_b64 s[30:31], s[4:5], 24
+; SI-NEXT:    s_lshr_b64 s[34:35], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[36:37], s[4:5], 8
+; SI-NEXT:    s_lshr_b32 s45, s19, 8
+; SI-NEXT:    s_lshr_b32 s43, s17, 8
+; SI-NEXT:    s_lshr_b32 s41, s15, 8
+; SI-NEXT:    s_lshr_b32 s29, s13, 8
+; SI-NEXT:    s_lshr_b32 s27, s11, 8
+; SI-NEXT:    s_lshr_b32 s25, s9, 8
+; SI-NEXT:    s_lshr_b32 s23, s7, 8
+; SI-NEXT:    s_lshr_b32 s21, s5, 8
+; SI-NEXT:    v_bfe_u32 v48, v7, 8, 8
+; SI-NEXT:    v_bfe_u32 v39, v6, 8, 8
+; SI-NEXT:    v_bfe_u32 v36, v5, 8, 8
+; SI-NEXT:    v_bfe_u32 v34, v20, 8, 8
+; SI-NEXT:    v_bfe_u32 v32, v4, 8, 8
+; SI-NEXT:    v_bfe_u32 v30, v3, 8, 8
+; SI-NEXT:    v_bfe_u32 v19, v2, 8, 8
+; SI-NEXT:    v_bfe_u32 v18, v1, 8, 8
 ; SI-NEXT:  .LBB105_3: ; %end
-; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v21, 0xff, v37
+; SI-NEXT:    s_and_b32 s18, s18, 0xff
+; SI-NEXT:    s_lshl_b32 s26, s26, 8
+; SI-NEXT:    s_and_b32 s22, s22, 0xff
+; SI-NEXT:    s_or_b32 s18, s18, s26
+; SI-NEXT:    s_lshl_b32 s22, s22, 16
+; SI-NEXT:    s_lshl_b32 s20, s20, 24
+; SI-NEXT:    s_and_b32 s18, s18, 0xffff
+; SI-NEXT:    s_or_b32 s20, s20, s22
+; SI-NEXT:    s_or_b32 s18, s18, s20
+; SI-NEXT:    v_mov_b32_e32 v8, s18
+; SI-NEXT:    s_and_b32 s18, s19, 0xff
+; SI-NEXT:    s_lshl_b32 s19, s45, 8
 ; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; SI-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
+; SI-NEXT:    s_or_b32 s18, s18, s19
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 8, v25
-; SI-NEXT:    v_or_b32_e32 v21, v21, v25
-; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v21
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v26, 24, v26
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v25, 0xff, v25
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; SI-NEXT:    v_or_b32_e32 v25, v26, v25
-; SI-NEXT:    v_or_b32_e32 v21, v21, v25
-; SI-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v21, 0xff, v32
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 8, v52
-; SI-NEXT:    v_or_b32_e32 v21, v21, v25
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 24, v54
-; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v21
-; SI-NEXT:    v_or_b32_e32 v7, v25, v7
-; SI-NEXT:    v_or_b32_e32 v7, v21, v7
-; SI-NEXT:    v_add_i32_e32 v21, vcc, 4, v0
-; SI-NEXT:    buffer_store_dword v7, v21, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 24, v48
+; SI-NEXT:    s_and_b32 s18, s18, 0xffff
+; SI-NEXT:    v_or_b32_e32 v7, v8, v7
+; SI-NEXT:    v_or_b32_e32 v7, s18, v7
+; SI-NEXT:    s_and_b32 s16, s16, 0xff
+; SI-NEXT:    s_lshl_b32 s18, s42, 8
+; SI-NEXT:    s_or_b32 s16, s16, s18
+; SI-NEXT:    s_and_b32 s18, s28, 0xff
+; SI-NEXT:    s_lshl_b32 s18, s18, 16
+; SI-NEXT:    s_lshl_b32 s19, s24, 24
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s18, s19, s18
+; SI-NEXT:    v_add_i32_e32 v8, vcc, 4, v0
+; SI-NEXT:    s_or_b32 s16, s16, s18
+; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v24
-; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 8, v21
-; SI-NEXT:    v_or_b32_e32 v7, v7, v21
-; SI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v24, 24, v24
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v21, 0xff, v21
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; SI-NEXT:    v_or_b32_e32 v21, v24, v21
-; SI-NEXT:    v_or_b32_e32 v7, v7, v21
-; SI-NEXT:    v_add_i32_e32 v21, vcc, 8, v0
-; SI-NEXT:    buffer_store_dword v7, v21, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v23
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 8, v39
-; SI-NEXT:    v_or_b32_e32 v7, v7, v21
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 24, v51
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; SI-NEXT:    v_or_b32_e32 v6, v21, v6
+; SI-NEXT:    v_add_i32_e32 v7, vcc, 8, v0
+; SI-NEXT:    v_mov_b32_e32 v8, s16
+; SI-NEXT:    s_and_b32 s16, s17, 0xff
+; SI-NEXT:    s_lshl_b32 s17, s43, 8
+; SI-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; SI-NEXT:    buffer_store_dword v8, v7, s[0:3], 0 offen
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v39
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
 ; SI-NEXT:    v_or_b32_e32 v6, v7, v6
+; SI-NEXT:    v_or_b32_e32 v6, s16, v6
+; SI-NEXT:    s_and_b32 s14, s14, 0xff
+; SI-NEXT:    s_lshl_b32 s16, s56, 8
+; SI-NEXT:    s_or_b32 s14, s14, s16
+; SI-NEXT:    s_and_b32 s16, s44, 0xff
+; SI-NEXT:    s_lshl_b32 s16, s16, 16
+; SI-NEXT:    s_lshl_b32 s17, s40, 24
+; SI-NEXT:    s_and_b32 s14, s14, 0xffff
+; SI-NEXT:    s_or_b32 s16, s17, s16
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 12, v0
-; SI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v6, 0xff, v18
-; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
-; SI-NEXT:    v_or_b32_e32 v6, v6, v7
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 24, v18
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    v_or_b32_e32 v7, v18, v7
-; SI-NEXT:    v_or_b32_e32 v6, v6, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 16, v0
+; SI-NEXT:    s_or_b32 s14, s14, s16
 ; SI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v6, 0xff, v19
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v36
-; SI-NEXT:    v_or_b32_e32 v6, v6, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v49
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; SI-NEXT:    v_or_b32_e32 v5, v7, v5
+; SI-NEXT:    v_add_i32_e32 v6, vcc, 16, v0
+; SI-NEXT:    v_mov_b32_e32 v7, s14
+; SI-NEXT:    s_and_b32 s14, s15, 0xff
+; SI-NEXT:    s_lshl_b32 s15, s41, 8
+; SI-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; SI-NEXT:    buffer_store_dword v7, v6, s[0:3], 0 offen
+; SI-NEXT:    s_or_b32 s14, s14, s15
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v36
+; SI-NEXT:    s_and_b32 s14, s14, 0xffff
 ; SI-NEXT:    v_or_b32_e32 v5, v6, v5
+; SI-NEXT:    v_or_b32_e32 v5, s14, v5
+; SI-NEXT:    s_and_b32 s12, s12, 0xff
+; SI-NEXT:    s_lshl_b32 s14, s62, 8
+; SI-NEXT:    s_or_b32 s12, s12, s14
+; SI-NEXT:    s_and_b32 s14, s58, 0xff
+; SI-NEXT:    s_lshl_b32 s14, s14, 16
+; SI-NEXT:    s_lshl_b32 s15, s46, 24
+; SI-NEXT:    s_and_b32 s12, s12, 0xffff
+; SI-NEXT:    s_or_b32 s14, s15, s14
 ; SI-NEXT:    v_add_i32_e32 v6, vcc, 20, v0
+; SI-NEXT:    s_or_b32 s12, s12, s14
 ; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v16
-; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
-; SI-NEXT:    v_or_b32_e32 v5, v5, v6
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v7
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT:    v_or_b32_e32 v6, v7, v6
-; SI-NEXT:    v_or_b32_e32 v5, v5, v6
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 24, v0
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v5, vcc, 24, v0
+; SI-NEXT:    v_mov_b32_e32 v6, s12
+; SI-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
+; SI-NEXT:    s_and_b32 s12, s13, 0xff
+; SI-NEXT:    s_lshl_b32 s13, s29, 8
+; SI-NEXT:    v_and_b32_e32 v5, 0xff, v20
+; SI-NEXT:    s_or_b32 s12, s12, s13
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v17
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v31
-; SI-NEXT:    v_or_b32_e32 v5, v5, v6
-; SI-NEXT:    v_and_b32_e32 v6, 0xff, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v38
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; SI-NEXT:    v_or_b32_e32 v6, v7, v6
-; SI-NEXT:    v_or_b32_e32 v5, v5, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v34
+; SI-NEXT:    s_and_b32 s12, s12, 0xffff
+; SI-NEXT:    v_or_b32_e32 v5, v6, v5
+; SI-NEXT:    v_or_b32_e32 v5, s12, v5
+; SI-NEXT:    s_and_b32 s10, s10, 0xff
+; SI-NEXT:    s_lshl_b32 s12, s76, 8
+; SI-NEXT:    s_or_b32 s10, s10, s12
+; SI-NEXT:    s_and_b32 s12, s72, 0xff
+; SI-NEXT:    s_lshl_b32 s12, s12, 16
+; SI-NEXT:    s_lshl_b32 s13, s60, 24
+; SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; SI-NEXT:    s_or_b32 s12, s13, s12
 ; SI-NEXT:    v_add_i32_e32 v6, vcc, 28, v0
+; SI-NEXT:    s_or_b32 s10, s10, s12
 ; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v15
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v61
-; SI-NEXT:    v_or_b32_e32 v5, v5, v6
-; SI-NEXT:    v_and_b32_e32 v6, 0xff, v58
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 24, v57
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; SI-NEXT:    v_or_b32_e32 v6, v7, v6
-; SI-NEXT:    v_or_b32_e32 v5, v5, v6
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 32, v0
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v14
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v28
-; SI-NEXT:    v_or_b32_e32 v5, v5, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v33
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; SI-NEXT:    v_or_b32_e32 v4, v6, v4
+; SI-NEXT:    v_add_i32_e32 v5, vcc, 32, v0
+; SI-NEXT:    v_mov_b32_e32 v6, s10
+; SI-NEXT:    s_and_b32 s10, s11, 0xff
+; SI-NEXT:    s_lshl_b32 s11, s27, 8
+; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; SI-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
+; SI-NEXT:    s_or_b32 s10, s10, s11
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 24, v32
+; SI-NEXT:    s_and_b32 s10, s10, 0xffff
 ; SI-NEXT:    v_or_b32_e32 v4, v5, v4
+; SI-NEXT:    v_or_b32_e32 v4, s10, v4
+; SI-NEXT:    s_and_b32 s8, s8, 0xff
+; SI-NEXT:    s_lshl_b32 s10, s90, 8
+; SI-NEXT:    s_or_b32 s8, s8, s10
+; SI-NEXT:    s_and_b32 s10, s78, 0xff
+; SI-NEXT:    s_lshl_b32 s10, s10, 16
+; SI-NEXT:    s_lshl_b32 s11, s74, 24
+; SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; SI-NEXT:    s_or_b32 s10, s11, s10
 ; SI-NEXT:    v_add_i32_e32 v5, vcc, 36, v0
+; SI-NEXT:    s_or_b32 s8, s8, s10
 ; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xff, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v56
-; SI-NEXT:    v_or_b32_e32 v4, v4, v5
-; SI-NEXT:    v_and_b32_e32 v5, 0xff, v47
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v44
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; SI-NEXT:    v_or_b32_e32 v5, v6, v5
-; SI-NEXT:    v_or_b32_e32 v4, v4, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 40, v0
-; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v4, 0xff, v13
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v63
-; SI-NEXT:    v_or_b32_e32 v4, v4, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 24, v29
-; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; SI-NEXT:    v_or_b32_e32 v3, v5, v3
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 40, v0
+; SI-NEXT:    v_mov_b32_e32 v5, s8
+; SI-NEXT:    s_and_b32 s8, s9, 0xff
+; SI-NEXT:    s_lshl_b32 s9, s25, 8
+; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; SI-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 offen
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v30
+; SI-NEXT:    s_and_b32 s8, s8, 0xffff
 ; SI-NEXT:    v_or_b32_e32 v3, v4, v3
+; SI-NEXT:    v_or_b32_e32 v3, s8, v3
+; SI-NEXT:    s_and_b32 s6, s6, 0xff
+; SI-NEXT:    s_lshl_b32 s8, s94, 8
+; SI-NEXT:    s_or_b32 s6, s6, s8
+; SI-NEXT:    s_and_b32 s8, s92, 0xff
+; SI-NEXT:    s_lshl_b32 s8, s8, 16
+; SI-NEXT:    s_lshl_b32 s9, s88, 24
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_or_b32 s8, s9, s8
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 44, v0
+; SI-NEXT:    s_or_b32 s6, s6, s8
 ; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v10
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v43
-; SI-NEXT:    v_or_b32_e32 v3, v3, v4
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
-; SI-NEXT:    v_or_b32_e32 v4, v5, v4
-; SI-NEXT:    v_or_b32_e32 v3, v3, v4
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 48, v0
-; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v59
-; SI-NEXT:    v_or_b32_e32 v3, v3, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v22
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v2, v4, v2
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 48, v0
+; SI-NEXT:    v_mov_b32_e32 v4, s6
+; SI-NEXT:    s_and_b32 s6, s7, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s23, 8
+; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; SI-NEXT:    buffer_store_dword v4, v3, s[0:3], 0 offen
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v19
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
 ; SI-NEXT:    v_or_b32_e32 v2, v3, v2
+; SI-NEXT:    v_or_b32_e32 v2, s6, v2
+; SI-NEXT:    s_and_b32 s4, s4, 0xff
+; SI-NEXT:    s_lshl_b32 s6, s36, 8
+; SI-NEXT:    s_or_b32 s4, s4, s6
+; SI-NEXT:    s_and_b32 s6, s34, 0xff
+; SI-NEXT:    s_lshl_b32 s6, s6, 16
+; SI-NEXT:    s_lshl_b32 s7, s30, 24
+; SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; SI-NEXT:    s_or_b32 s6, s7, s6
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 52, v0
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v9
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_or_b32_e32 v3, v4, v3
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 56, v0
+; SI-NEXT:    s_or_b32 s4, s4, s6
 ; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v45
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v60
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 56, v0
+; SI-NEXT:    v_mov_b32_e32 v3, s4
+; SI-NEXT:    s_and_b32 s4, s5, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s21, 8
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
+; SI-NEXT:    s_or_b32 s4, s4, s5
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 24, v18
+; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
+; SI-NEXT:    v_or_b32_e32 v1, s4, v1
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 60, v0
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT:    v_readlane_b32 s37, v40, 5
+; SI-NEXT:    v_readlane_b32 s36, v40, 4
+; SI-NEXT:    v_readlane_b32 s35, v40, 3
+; SI-NEXT:    v_readlane_b32 s34, v40, 2
+; SI-NEXT:    v_readlane_b32 s31, v40, 1
+; SI-NEXT:    v_readlane_b32 s30, v40, 0
+; SI-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB105_4:
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; kill: killed $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; kill: killed $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr37
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr52
-; SI-NEXT:    ; implicit-def: $vgpr54
-; SI-NEXT:    ; implicit-def: $vgpr24
-; SI-NEXT:    ; implicit-def: $vgpr23
+; SI-NEXT:    ; implicit-def: $sgpr18
+; SI-NEXT:    ; implicit-def: $sgpr26
+; SI-NEXT:    ; implicit-def: $sgpr22
+; SI-NEXT:    ; implicit-def: $sgpr20
+; SI-NEXT:    ; implicit-def: $sgpr45
+; SI-NEXT:    ; implicit-def: $vgpr48
+; SI-NEXT:    ; implicit-def: $sgpr16
+; SI-NEXT:    ; implicit-def: $sgpr42
+; SI-NEXT:    ; implicit-def: $sgpr28
+; SI-NEXT:    ; implicit-def: $sgpr24
+; SI-NEXT:    ; implicit-def: $sgpr43
 ; SI-NEXT:    ; implicit-def: $vgpr39
-; SI-NEXT:    ; implicit-def: $vgpr51
-; SI-NEXT:    ; implicit-def: $vgpr18
-; SI-NEXT:    ; implicit-def: $vgpr19
+; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr56
+; SI-NEXT:    ; implicit-def: $sgpr44
+; SI-NEXT:    ; implicit-def: $sgpr40
+; SI-NEXT:    ; implicit-def: $sgpr41
 ; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr49
-; SI-NEXT:    ; implicit-def: $vgpr16
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr38
-; SI-NEXT:    ; implicit-def: $vgpr15
-; SI-NEXT:    ; implicit-def: $vgpr61
-; SI-NEXT:    ; implicit-def: $vgpr58
-; SI-NEXT:    ; implicit-def: $vgpr57
-; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr33
-; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; implicit-def: $vgpr56
-; SI-NEXT:    ; implicit-def: $vgpr47
-; SI-NEXT:    ; implicit-def: $vgpr44
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $vgpr63
-; SI-NEXT:    ; implicit-def: $vgpr29
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $vgpr43
-; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; implicit-def: $vgpr59
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr60
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; kill: killed $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; kill: killed $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; kill: killed $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; kill: killed $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; kill: killed $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; kill: killed $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; kill: killed $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; kill: killed $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; kill: killed $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; kill: killed $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; kill: killed $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; kill: killed $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; kill: killed $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; kill: killed $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; kill: killed $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr8
+; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr62
+; SI-NEXT:    ; implicit-def: $sgpr58
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr29
+; SI-NEXT:    ; implicit-def: $vgpr34
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr76
+; SI-NEXT:    ; implicit-def: $sgpr72
+; SI-NEXT:    ; implicit-def: $sgpr60
+; SI-NEXT:    ; implicit-def: $sgpr27
+; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; implicit-def: $sgpr8
+; SI-NEXT:    ; implicit-def: $sgpr90
+; SI-NEXT:    ; implicit-def: $sgpr78
+; SI-NEXT:    ; implicit-def: $sgpr74
+; SI-NEXT:    ; implicit-def: $sgpr25
+; SI-NEXT:    ; implicit-def: $vgpr30
+; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr94
+; SI-NEXT:    ; implicit-def: $sgpr92
+; SI-NEXT:    ; implicit-def: $sgpr88
+; SI-NEXT:    ; implicit-def: $sgpr23
+; SI-NEXT:    ; implicit-def: $vgpr19
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr21
+; SI-NEXT:    ; implicit-def: $vgpr18
+; SI-NEXT:    ; implicit-def: $sgpr36
+; SI-NEXT:    ; implicit-def: $sgpr34
+; SI-NEXT:    ; implicit-def: $sgpr30
 ; SI-NEXT:    s_branch .LBB105_2
 ;
 ; VI-LABEL: bitcast_v32f16_to_v64i8_scalar:
@@ -85721,728 +86232,500 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    v_mul_f32_e64 v19, 1.0, s17
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_mul_f32_e32 v62, 1.0, v2
-; SI-NEXT:    v_mul_f32_e32 v23, 1.0, v1
-; SI-NEXT:    v_mul_f32_e32 v38, 1.0, v4
-; SI-NEXT:    v_mul_f32_e32 v48, 1.0, v3
-; SI-NEXT:    v_mul_f32_e32 v34, 1.0, v6
-; SI-NEXT:    v_mul_f32_e32 v37, 1.0, v5
-; SI-NEXT:    v_mul_f32_e32 v54, 1.0, v8
-; SI-NEXT:    v_mul_f32_e32 v40, 1.0, v7
-; SI-NEXT:    v_mul_f32_e32 v51, 1.0, v10
-; SI-NEXT:    v_mul_f32_e32 v53, 1.0, v9
-; SI-NEXT:    v_mul_f32_e32 v44, 1.0, v12
-; SI-NEXT:    v_mul_f32_e32 v45, 1.0, v11
-; SI-NEXT:    v_mul_f32_e32 v42, 1.0, v14
-; SI-NEXT:    v_mul_f32_e32 v43, 1.0, v13
-; SI-NEXT:    v_mul_f32_e32 v57, 1.0, v16
-; SI-NEXT:    v_mul_f32_e32 v58, 1.0, v15
-; SI-NEXT:    v_mul_f32_e32 v47, 1.0, v18
-; SI-NEXT:    v_mul_f32_e32 v56, 1.0, v17
+; SI-NEXT:    v_mul_f32_e32 v33, 1.0, v2
+; SI-NEXT:    v_mul_f32_e32 v27, 1.0, v1
+; SI-NEXT:    v_mul_f32_e32 v50, 1.0, v4
+; SI-NEXT:    v_mul_f32_e32 v52, 1.0, v3
+; SI-NEXT:    v_mul_f32_e32 v39, 1.0, v6
+; SI-NEXT:    v_mul_f32_e32 v49, 1.0, v5
+; SI-NEXT:    v_mul_f32_e32 v44, 1.0, v8
+; SI-NEXT:    v_mul_f32_e32 v46, 1.0, v7
+; SI-NEXT:    v_mul_f32_e32 v40, 1.0, v10
+; SI-NEXT:    v_mul_f32_e32 v43, 1.0, v9
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    v_mul_f32_e32 v61, 1.0, v12
+; SI-NEXT:    v_mul_f32_e32 v25, 1.0, v11
+; SI-NEXT:    v_mul_f32_e32 v57, 1.0, v14
+; SI-NEXT:    v_mul_f32_e32 v60, 1.0, v13
+; SI-NEXT:    v_mul_f32_e32 v36, 1.0, v16
+; SI-NEXT:    v_mul_f32_e32 v37, 1.0, v15
+; SI-NEXT:    v_mul_f32_e32 v32, 1.0, v18
+; SI-NEXT:    v_mul_f32_e32 v34, 1.0, v17
 ; SI-NEXT:    v_mul_f32_e64 v3, 1.0, s16
-; SI-NEXT:    v_mul_f32_e64 v46, 1.0, s19
+; SI-NEXT:    v_mul_f32_e64 v26, 1.0, s19
 ; SI-NEXT:    v_mul_f32_e64 v2, 1.0, s18
 ; SI-NEXT:    v_mul_f32_e64 v5, 1.0, s21
-; SI-NEXT:    v_mul_f32_e64 v28, 1.0, s20
-; SI-NEXT:    v_mul_f32_e64 v61, 1.0, s23
+; SI-NEXT:    v_mul_f32_e64 v6, 1.0, s20
+; SI-NEXT:    v_mul_f32_e64 v29, 1.0, s23
 ; SI-NEXT:    v_mul_f32_e64 v4, 1.0, s22
-; SI-NEXT:    v_mul_f32_e64 v36, 1.0, s25
-; SI-NEXT:    v_mul_f32_e64 v39, 1.0, s24
-; SI-NEXT:    v_mul_f32_e64 v31, 1.0, s27
-; SI-NEXT:    v_mul_f32_e64 v35, 1.0, s26
-; SI-NEXT:    v_mul_f32_e64 v52, 1.0, s29
+; SI-NEXT:    v_mul_f32_e64 v8, 1.0, s25
+; SI-NEXT:    v_mul_f32_e64 v38, 1.0, s24
+; SI-NEXT:    v_mul_f32_e64 v35, 1.0, s27
+; SI-NEXT:    v_mul_f32_e64 v7, 1.0, s26
+; SI-NEXT:    v_mul_f32_e64 v53, 1.0, s29
 ; SI-NEXT:    v_mul_f32_e64 v55, 1.0, s28
-; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; SI-NEXT:    s_cbranch_scc0 .LBB109_2
+; SI-NEXT:    buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT:    s_cbranch_scc0 .LBB109_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v19
-; SI-NEXT:    v_lshrrev_b32_e32 v24, 16, v46
-; SI-NEXT:    v_alignbit_b32 v27, v1, v3, 16
-; SI-NEXT:    v_alignbit_b32 v30, v24, v2, 16
-; SI-NEXT:    v_alignbit_b32 v1, v30, v27, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v30, v27, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v30, v27, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_alignbit_b32 v23, v1, v3, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v61
-; SI-NEXT:    v_alignbit_b32 v21, v1, v28, 16
-; SI-NEXT:    v_alignbit_b32 v19, v17, v4, 16
-; SI-NEXT:    v_alignbit_b32 v1, v19, v21, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v19, v21, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v19, v21, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v36
-; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v31
-; SI-NEXT:    v_alignbit_b32 v15, v1, v39, 16
-; SI-NEXT:    v_alignbit_b32 v16, v13, v35, 16
-; SI-NEXT:    v_alignbit_b32 v1, v16, v15, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v16, v15, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v16, v15, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v52
-; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v62
-; SI-NEXT:    v_alignbit_b32 v10, v1, v55, 16
-; SI-NEXT:    v_alignbit_b32 v11, v9, v23, 16
-; SI-NEXT:    v_alignbit_b32 v1, v11, v10, 24
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v11, v10, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v1, v11, v10, 8
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v38
-; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v34
-; SI-NEXT:    v_alignbit_b32 v6, v1, v48, 16
-; SI-NEXT:    v_alignbit_b32 v20, v18, v37, 16
-; SI-NEXT:    v_alignbit_b32 v4, v20, v6, 24
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v54
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v4, v20, v6, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v51
-; SI-NEXT:    v_alignbit_b32 v3, v1, v40, 16
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v4, v20, v6, 8
-; SI-NEXT:    v_alignbit_b32 v14, v12, v53, 16
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v4, v14, v3, 24
+; SI-NEXT:    v_alignbit_b32 v20, v1, v6, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
+; SI-NEXT:    v_alignbit_b32 v17, v1, v38, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v53
+; SI-NEXT:    v_alignbit_b32 v14, v1, v55, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v50
+; SI-NEXT:    v_alignbit_b32 v11, v1, v52, 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v44
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v4, v14, v3, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v42
-; SI-NEXT:    v_alignbit_b32 v2, v1, v45, 16
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v4, v14, v3, 8
-; SI-NEXT:    v_alignbit_b32 v8, v7, v43, 16
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v4, v8, v2, 24
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v4, v8, v2, 16
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v4, v8, v2, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v57
-; SI-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v47
-; SI-NEXT:    v_alignbit_b32 v1, v1, v58, 16
-; SI-NEXT:    v_alignbit_b32 v5, v4, v56, 16
-; SI-NEXT:    v_alignbit_b32 v25, v5, v1, 24
-; SI-NEXT:    v_lshrrev_b32_e32 v59, 8, v14
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v25, v5, v1, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v22, 8, v11
-; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v59, 8, v8
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v25, v5, v1, 8
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v22, 24, v34
-; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v29
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v59, 24, v47
-; SI-NEXT:    s_mov_b64 s[4:5], 0
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshrrev_b32_e32 v41, 24, v46
-; SI-NEXT:    v_lshrrev_b32_e32 v50, 8, v30
-; SI-NEXT:    v_lshrrev_b32_e32 v49, 24, v61
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 8, v19
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 24, v31
+; SI-NEXT:    v_alignbit_b32 v8, v1, v46, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v61
+; SI-NEXT:    v_alignbit_b32 v21, v19, v4, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v35
+; SI-NEXT:    v_alignbit_b32 v4, v1, v25, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v36
+; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v26
+; SI-NEXT:    v_alignbit_b32 v18, v16, v7, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v33
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v39
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v40
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v57
+; SI-NEXT:    v_alignbit_b32 v3, v1, v37, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v32
+; SI-NEXT:    v_alignbit_b32 v24, v22, v2, 16
+; SI-NEXT:    v_alignbit_b32 v15, v13, v27, 16
+; SI-NEXT:    v_alignbit_b32 v12, v10, v49, 16
+; SI-NEXT:    v_alignbit_b32 v9, v7, v43, 16
+; SI-NEXT:    v_alignbit_b32 v5, v6, v60, 16
+; SI-NEXT:    v_alignbit_b32 v2, v1, v34, 16
+; SI-NEXT:    v_readfirstlane_b32 s8, v23
+; SI-NEXT:    v_readfirstlane_b32 s9, v24
+; SI-NEXT:    v_readfirstlane_b32 s14, v20
+; SI-NEXT:    v_readfirstlane_b32 s15, v21
+; SI-NEXT:    v_readfirstlane_b32 s20, v17
+; SI-NEXT:    v_readfirstlane_b32 s21, v18
+; SI-NEXT:    v_readfirstlane_b32 s26, v14
+; SI-NEXT:    v_readfirstlane_b32 s27, v15
+; SI-NEXT:    v_readfirstlane_b32 s42, v11
+; SI-NEXT:    v_readfirstlane_b32 s43, v12
+; SI-NEXT:    v_readfirstlane_b32 s56, v8
+; SI-NEXT:    v_readfirstlane_b32 s57, v9
+; SI-NEXT:    v_readfirstlane_b32 s62, v4
+; SI-NEXT:    v_readfirstlane_b32 s63, v5
+; SI-NEXT:    v_readfirstlane_b32 s76, v3
+; SI-NEXT:    v_readfirstlane_b32 s77, v2
+; SI-NEXT:    s_lshr_b64 s[4:5], s[8:9], 24
+; SI-NEXT:    s_lshr_b64 s[6:7], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[8:9], 8
+; SI-NEXT:    s_lshr_b64 s[8:9], s[14:15], 24
+; SI-NEXT:    s_lshr_b64 s[12:13], s[14:15], 16
+; SI-NEXT:    s_lshr_b64 s[16:17], s[14:15], 8
+; SI-NEXT:    s_lshr_b64 s[14:15], s[20:21], 24
+; SI-NEXT:    s_lshr_b64 s[18:19], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[22:23], s[20:21], 8
+; SI-NEXT:    s_lshr_b64 s[20:21], s[26:27], 24
+; SI-NEXT:    s_lshr_b64 s[24:25], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[28:29], s[26:27], 8
+; SI-NEXT:    s_lshr_b64 s[26:27], s[42:43], 24
+; SI-NEXT:    s_lshr_b64 s[40:41], s[42:43], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[42:43], 8
+; SI-NEXT:    s_lshr_b64 s[42:43], s[56:57], 24
+; SI-NEXT:    s_lshr_b64 s[46:47], s[56:57], 16
+; SI-NEXT:    s_lshr_b64 s[58:59], s[56:57], 8
+; SI-NEXT:    s_lshr_b64 s[56:57], s[62:63], 24
+; SI-NEXT:    s_lshr_b64 s[60:61], s[62:63], 16
+; SI-NEXT:    s_lshr_b64 s[72:73], s[62:63], 8
+; SI-NEXT:    s_lshr_b64 s[62:63], s[76:77], 24
+; SI-NEXT:    s_lshr_b64 s[74:75], s[76:77], 16
+; SI-NEXT:    s_lshr_b64 s[76:77], s[76:77], 8
+; SI-NEXT:    v_lshrrev_b32_e32 v30, 24, v26
+; SI-NEXT:    v_lshrrev_b32_e32 v27, 8, v24
+; SI-NEXT:    v_lshrrev_b32_e32 v62, 24, v29
+; SI-NEXT:    v_lshrrev_b32_e32 v58, 8, v21
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 24, v35
+; SI-NEXT:    v_lshrrev_b32_e32 v28, 8, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v63, 24, v33
+; SI-NEXT:    v_lshrrev_b32_e32 v59, 8, v15
+; SI-NEXT:    v_lshrrev_b32_e32 v56, 24, v39
+; SI-NEXT:    v_lshrrev_b32_e32 v47, 8, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v45, 24, v40
+; SI-NEXT:    v_lshrrev_b32_e32 v41, 8, v9
+; SI-NEXT:    v_lshrrev_b32_e32 v42, 24, v57
+; SI-NEXT:    v_lshrrev_b32_e32 v54, 8, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v51, 24, v32
+; SI-NEXT:    v_lshrrev_b32_e32 v48, 8, v2
+; SI-NEXT:    s_cbranch_execnz .LBB109_3
+; SI-NEXT:  .LBB109_2: ; %cmp.true
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v53
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v55
+; SI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; SI-NEXT:    v_alignbit_b32 v14, v14, v13, 16
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v38
+; SI-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v36
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v25, 8, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v26, 24, v62
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshrrev_b32_e32 v29, 8, v20
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v22, 24, v51
-; SI-NEXT:    v_lshrrev_b32_e32 v63, 24, v42
-; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshrrev_b32_e32 v60, 8, v5
-; SI-NEXT:    s_branch .LBB109_3
-; SI-NEXT:  .LBB109_2:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; kill: killed $vgpr4
-; SI-NEXT:    s_mov_b64 s[4:5], -1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; kill: killed $vgpr4
-; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $vgpr30
-; SI-NEXT:    ; implicit-def: $vgpr50
-; SI-NEXT:    ; implicit-def: $vgpr24
-; SI-NEXT:    ; implicit-def: $vgpr41
-; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr19
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr49
-; SI-NEXT:    ; implicit-def: $vgpr15
-; SI-NEXT:    ; implicit-def: $vgpr16
-; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr13
-; SI-NEXT:    ; implicit-def: $vgpr33
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $vgpr11
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr26
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $vgpr20
-; SI-NEXT:    ; implicit-def: $vgpr29
-; SI-NEXT:    ; implicit-def: $vgpr18
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $vgpr7
-; SI-NEXT:    ; implicit-def: $vgpr63
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $vgpr60
-; SI-NEXT:    ; implicit-def: $vgpr59
-; SI-NEXT:    ; kill: killed $vgpr59
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; kill: killed $vgpr4
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; kill: killed $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:  .LBB109_3: ; %Flow
-; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v59, v63
-; SI-NEXT:    v_mov_b32_e32 v63, v22
-; SI-NEXT:    s_cbranch_vccnz .LBB109_5
-; SI-NEXT:  ; %bb.4: ; %cmp.true
-; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v17, 0xffff0000, v28
-; SI-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v57
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v58
+; SI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v44
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v50
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v37
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v46
+; SI-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v52
+; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v47
-; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v56
-; SI-NEXT:    v_add_f32_e32 v29, 0x40c00000, v3
-; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v29
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v44
-; SI-NEXT:    v_alignbit_b32 v5, v4, v2, 16
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v45
-; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v42
-; SI-NEXT:    v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v43
-; SI-NEXT:    v_add_f32_e32 v42, 0x40c00000, v6
-; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v42
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v54
-; SI-NEXT:    v_alignbit_b32 v8, v7, v3, 16
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v40
-; SI-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v51
-; SI-NEXT:    v_alignbit_b32 v3, v6, v3, 16
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v53
-; SI-NEXT:    v_add_f32_e32 v51, 0x40c00000, v9
-; SI-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v12, 16, v51
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v38
-; SI-NEXT:    v_alignbit_b32 v14, v12, v6, 16
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v48
-; SI-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v36
-; SI-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v34
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v39
-; SI-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
-; SI-NEXT:    v_alignbit_b32 v6, v9, v6, 16
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v37
-; SI-NEXT:    v_add_f32_e32 v34, 0x40c00000, v10
-; SI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
-; SI-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
-; SI-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v34
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v52
-; SI-NEXT:    v_alignbit_b32 v15, v15, v13, 16
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v35
-; SI-NEXT:    v_alignbit_b32 v20, v18, v9, 16
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v55
+; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v61
+; SI-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
 ; SI-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
-; SI-NEXT:    v_add_f32_e32 v16, 0x40c00000, v13
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v31
-; SI-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
-; SI-NEXT:    v_add_f32_e32 v26, 0x40c00000, v13
-; SI-NEXT:    v_alignbit_b32 v10, v10, v9, 16
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v23
-; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v26
-; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v9
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v62
-; SI-NEXT:    v_alignbit_b32 v16, v13, v16, 16
-; SI-NEXT:    v_add_f32_e32 v23, 0x40c00000, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v23
-; SI-NEXT:    v_alignbit_b32 v11, v9, v11, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 8, v11
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 24, v26
-; SI-NEXT:    v_lshrrev_b32_e32 v26, 24, v23
-; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; SI-NEXT:    v_alignbit_b32 v3, v2, v1, 16
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v34
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v25
+; SI-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT:    v_alignbit_b32 v8, v8, v7, 16
+; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v43
+; SI-NEXT:    v_alignbit_b32 v11, v11, v10, 16
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v49
+; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v1
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v32
+; SI-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v57
+; SI-NEXT:    v_add_f32_e32 v9, 0x40c00000, v7
+; SI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v40
+; SI-NEXT:    v_add_f32_e32 v12, 0x40c00000, v10
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v39
+; SI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v1
+; SI-NEXT:    v_alignbit_b32 v4, v5, v4, 16
+; SI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v60
+; SI-NEXT:    v_add_f32_e32 v25, 0x40c00000, v6
+; SI-NEXT:    v_add_f32_e32 v34, 0x40c00000, v7
+; SI-NEXT:    v_add_f32_e32 v36, 0x40c00000, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v32
+; SI-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v25
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v34
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v36
+; SI-NEXT:    v_alignbit_b32 v2, v1, v2, 16
+; SI-NEXT:    v_alignbit_b32 v5, v6, v5, 16
+; SI-NEXT:    v_alignbit_b32 v9, v7, v9, 16
+; SI-NEXT:    v_alignbit_b32 v12, v10, v12, 16
+; SI-NEXT:    v_readfirstlane_b32 s76, v3
+; SI-NEXT:    v_readfirstlane_b32 s77, v2
+; SI-NEXT:    v_readfirstlane_b32 s62, v4
+; SI-NEXT:    v_readfirstlane_b32 s63, v5
+; SI-NEXT:    v_readfirstlane_b32 s56, v8
+; SI-NEXT:    v_readfirstlane_b32 s57, v9
+; SI-NEXT:    v_readfirstlane_b32 s42, v11
+; SI-NEXT:    v_readfirstlane_b32 s43, v12
+; SI-NEXT:    v_readfirstlane_b32 s26, v14
+; SI-NEXT:    s_lshr_b64 s[40:41], s[42:43], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[42:43], 8
+; SI-NEXT:    s_lshr_b64 s[46:47], s[56:57], 16
+; SI-NEXT:    s_lshr_b64 s[58:59], s[56:57], 8
+; SI-NEXT:    s_lshr_b64 s[60:61], s[62:63], 16
+; SI-NEXT:    s_lshr_b64 s[72:73], s[62:63], 8
+; SI-NEXT:    s_lshr_b64 s[74:75], s[76:77], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v47, 8, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v41, 8, v9
+; SI-NEXT:    v_lshrrev_b32_e32 v54, 8, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v48, 8, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v56, 24, v36
+; SI-NEXT:    v_lshrrev_b32_e32 v45, 24, v34
+; SI-NEXT:    v_lshrrev_b32_e32 v42, 24, v25
+; SI-NEXT:    v_lshrrev_b32_e32 v51, 24, v32
+; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(4)
+; SI-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; SI-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
+; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; SI-NEXT:    v_alignbit_b32 v17, v17, v16, 16
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT:    v_readfirstlane_b32 s20, v17
+; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; SI-NEXT:    v_add_f32_e32 v19, 0x40c00000, v19
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
-; SI-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
-; SI-NEXT:    v_add_f32_e32 v25, 0x40c00000, v25
-; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
-; SI-NEXT:    v_add_f32_e32 v24, 0x40c00000, v24
-; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
-; SI-NEXT:    v_alignbit_b32 v21, v19, v17, 16
-; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; SI-NEXT:    v_alignbit_b32 v27, v25, v24, 16
-; SI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshrrev_b32_e32 v63, 24, v51
-; SI-NEXT:    v_lshrrev_b32_e32 v59, 24, v42
-; SI-NEXT:    v_lshrrev_b32_e32 v60, 8, v5
-; SI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
-; SI-NEXT:    v_add_f32_e32 v19, 0x40c00000, v17
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
-; SI-NEXT:    v_add_f32_e32 v25, 0x40c00000, v24
-; SI-NEXT:    v_and_b32_e32 v24, 0xffff0000, v46
-; SI-NEXT:    v_add_f32_e32 v28, 0x40c00000, v24
-; SI-NEXT:    v_lshrrev_b32_e32 v24, 16, v28
-; SI-NEXT:    v_and_b32_e32 v17, 0xffff0000, v61
-; SI-NEXT:    v_alignbit_b32 v30, v24, v25, 16
-; SI-NEXT:    v_add_f32_e32 v22, 0x40c00000, v17
-; SI-NEXT:    v_alignbit_b32 v25, v30, v27, 24
-; SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v22
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v25, v30, v27, 16
-; SI-NEXT:    v_alignbit_b32 v19, v17, v19, 16
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v25, v30, v27, 8
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v25, v19, v21, 24
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v25, v19, v21, 16
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v25, v19, v21, 8
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v25, v16, v15, 24
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v25, v16, v15, 16
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v25, v16, v15, 8
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v25, v11, v10, 24
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v25, v11, v10, 16
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v25, v11, v10, 8
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v25, v20, v6, 24
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v25, v20, v6, 16
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v25, v20, v6, 8
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v25, v14, v3, 24
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v25, v14, v3, 16
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v25, v14, v3, 8
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v25, v8, v2, 24
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v25, v8, v2, 16
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v25, v8, v2, 8
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v25, v5, v1, 24
-; SI-NEXT:    v_lshrrev_b32_e32 v49, 24, v22
-; SI-NEXT:    v_lshrrev_b32_e32 v22, 24, v34
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v25, v5, v1, 16
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v22, 24, v29
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_alignbit_b32 v25, v5, v1, 8
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v22, 8, v14
-; SI-NEXT:    buffer_store_dword v25, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; SI-NEXT:    v_lshrrev_b32_e32 v50, 8, v30
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 8, v19
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v25, 8, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v41, 24, v28
-; SI-NEXT:    v_lshrrev_b32_e32 v29, 8, v20
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v22, 8, v8
-; SI-NEXT:    buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; SI-NEXT:  .LBB109_5: ; %end
-; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v22, 0xff, v27
-; SI-NEXT:    v_and_b32_e32 v21, 0xff, v21
-; SI-NEXT:    v_and_b32_e32 v19, 0xff, v19
-; SI-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT:    v_and_b32_e32 v15, 0xff, v15
-; SI-NEXT:    v_and_b32_e32 v13, 0xff, v13
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT:    v_and_b32_e32 v10, 0xff, v10
-; SI-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; SI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v23, 8, v23
-; SI-NEXT:    v_or_b32_e32 v22, v22, v23
-; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v22, 0xffff, v22
+; SI-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; SI-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
+; SI-NEXT:    v_add_f32_e32 v23, 0x40c00000, v23
+; SI-NEXT:    v_add_f32_e32 v19, 0x40c00000, v19
+; SI-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
+; SI-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
+; SI-NEXT:    v_alignbit_b32 v20, v20, v19, 16
+; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT:    v_alignbit_b32 v23, v23, v22, 16
+; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; SI-NEXT:    v_add_f32_e32 v15, 0x40c00000, v13
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v33
+; SI-NEXT:    v_add_f32_e32 v18, 0x40c00000, v16
+; SI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v35
+; SI-NEXT:    v_add_f32_e32 v33, 0x40c00000, v13
+; SI-NEXT:    v_add_f32_e32 v31, 0x40c00000, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v33
+; SI-NEXT:    v_lshrrev_b32_e32 v16, 16, v31
+; SI-NEXT:    v_alignbit_b32 v15, v13, v15, 16
+; SI-NEXT:    v_alignbit_b32 v18, v16, v18, 16
+; SI-NEXT:    v_readfirstlane_b32 s27, v15
+; SI-NEXT:    v_readfirstlane_b32 s21, v18
+; SI-NEXT:    v_readfirstlane_b32 s14, v20
+; SI-NEXT:    v_readfirstlane_b32 s8, v23
+; SI-NEXT:    s_lshr_b64 s[18:19], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[22:23], s[20:21], 8
+; SI-NEXT:    s_lshr_b64 s[24:25], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[28:29], s[26:27], 8
+; SI-NEXT:    v_lshrrev_b32_e32 v28, 8, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v59, 8, v15
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 24, v31
+; SI-NEXT:    v_lshrrev_b32_e32 v63, 24, v33
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v27, 24, v27
+; SI-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; SI-NEXT:    v_add_f32_e32 v21, 0x40c00000, v19
 ; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; SI-NEXT:    v_and_b32_e32 v19, 0xffff0000, v29
+; SI-NEXT:    v_add_f32_e32 v24, 0x40c00000, v22
+; SI-NEXT:    v_and_b32_e32 v22, 0xffff0000, v26
+; SI-NEXT:    v_add_f32_e32 v29, 0x40c00000, v19
+; SI-NEXT:    v_add_f32_e32 v26, 0x40c00000, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v19, 16, v29
+; SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v26
+; SI-NEXT:    v_alignbit_b32 v21, v19, v21, 16
+; SI-NEXT:    v_alignbit_b32 v24, v22, v24, 16
+; SI-NEXT:    v_readfirstlane_b32 s15, v21
+; SI-NEXT:    v_readfirstlane_b32 s9, v24
+; SI-NEXT:    s_lshr_b64 s[4:5], s[8:9], 24
+; SI-NEXT:    s_lshr_b64 s[6:7], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[8:9], 8
+; SI-NEXT:    s_lshr_b64 s[8:9], s[14:15], 24
+; SI-NEXT:    s_lshr_b64 s[12:13], s[14:15], 16
+; SI-NEXT:    s_lshr_b64 s[16:17], s[14:15], 8
+; SI-NEXT:    s_lshr_b64 s[14:15], s[20:21], 24
+; SI-NEXT:    s_lshr_b64 s[20:21], s[26:27], 24
+; SI-NEXT:    s_lshr_b64 s[26:27], s[42:43], 24
+; SI-NEXT:    s_lshr_b64 s[42:43], s[56:57], 24
+; SI-NEXT:    s_lshr_b64 s[56:57], s[62:63], 24
+; SI-NEXT:    s_lshr_b64 s[62:63], s[76:77], 24
+; SI-NEXT:    s_lshr_b64 s[76:77], s[76:77], 8
+; SI-NEXT:    v_lshrrev_b32_e32 v27, 8, v24
+; SI-NEXT:    v_lshrrev_b32_e32 v58, 8, v21
+; SI-NEXT:    v_lshrrev_b32_e32 v30, 24, v26
+; SI-NEXT:    v_lshrrev_b32_e32 v62, 24, v29
+; SI-NEXT:  .LBB109_3: ; %end
 ; SI-NEXT:    v_and_b32_e32 v23, 0xff, v23
-; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; SI-NEXT:    v_or_b32_e32 v23, v27, v23
-; SI-NEXT:    v_or_b32_e32 v22, v22, v23
-; SI-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen
+; SI-NEXT:    s_lshl_b32 s5, s10, 8
+; SI-NEXT:    v_or_b32_e32 v23, s5, v23
+; SI-NEXT:    s_and_b32 s5, s6, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s5, 16
+; SI-NEXT:    s_lshl_b32 s4, s4, 24
+; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v23
+; SI-NEXT:    s_or_b32 s4, s4, s5
+; SI-NEXT:    v_or_b32_e32 v23, s4, v23
+; SI-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v22, 0xff, v30
-; SI-NEXT:    v_lshlrev_b32_e32 v23, 8, v50
-; SI-NEXT:    v_or_b32_e32 v22, v22, v23
 ; SI-NEXT:    v_and_b32_e32 v23, 0xff, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; SI-NEXT:    v_lshlrev_b32_e32 v24, 24, v41
-; SI-NEXT:    v_and_b32_e32 v22, 0xffff, v22
-; SI-NEXT:    v_or_b32_e32 v23, v24, v23
-; SI-NEXT:    v_or_b32_e32 v22, v22, v23
-; SI-NEXT:    v_add_i32_e32 v23, vcc, 4, v0
-; SI-NEXT:    buffer_store_dword v22, v23, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 8, v22
-; SI-NEXT:    v_or_b32_e32 v21, v21, v22
-; SI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v21
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v23, 24, v23
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v24, 8, v27
 ; SI-NEXT:    v_and_b32_e32 v22, 0xff, v22
+; SI-NEXT:    v_and_b32_e32 v20, 0xff, v20
+; SI-NEXT:    s_lshl_b32 s4, s16, 8
+; SI-NEXT:    v_or_b32_e32 v23, v23, v24
 ; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
+; SI-NEXT:    v_lshlrev_b32_e32 v24, 24, v30
+; SI-NEXT:    v_or_b32_e32 v20, s4, v20
+; SI-NEXT:    s_and_b32 s4, s12, 0xff
+; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v23
+; SI-NEXT:    v_or_b32_e32 v22, v24, v22
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    s_lshl_b32 s5, s8, 24
 ; SI-NEXT:    v_or_b32_e32 v22, v23, v22
-; SI-NEXT:    v_or_b32_e32 v21, v21, v22
+; SI-NEXT:    v_add_i32_e32 v23, vcc, 4, v0
+; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v20
+; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    buffer_store_dword v22, v23, s[0:3], 0 offen
+; SI-NEXT:    v_or_b32_e32 v20, s4, v20
+; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_add_i32_e32 v22, vcc, 8, v0
-; SI-NEXT:    buffer_store_dword v21, v22, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v20, v22, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 8, v32
-; SI-NEXT:    v_or_b32_e32 v19, v19, v21
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 24, v49
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; SI-NEXT:    v_or_b32_e32 v17, v21, v17
-; SI-NEXT:    v_or_b32_e32 v17, v19, v17
-; SI-NEXT:    v_add_i32_e32 v19, vcc, 12, v0
+; SI-NEXT:    v_and_b32_e32 v20, 0xff, v21
+; SI-NEXT:    v_lshlrev_b32_e32 v21, 8, v58
+; SI-NEXT:    v_and_b32_e32 v19, 0xff, v19
+; SI-NEXT:    v_and_b32_e32 v17, 0xff, v17
+; SI-NEXT:    s_lshl_b32 s4, s22, 8
+; SI-NEXT:    v_or_b32_e32 v20, v20, v21
+; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; SI-NEXT:    v_lshlrev_b32_e32 v21, 24, v62
+; SI-NEXT:    v_or_b32_e32 v17, s4, v17
+; SI-NEXT:    s_and_b32 s4, s18, 0xff
+; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v20
+; SI-NEXT:    v_or_b32_e32 v19, v21, v19
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    s_lshl_b32 s5, s14, 24
+; SI-NEXT:    v_or_b32_e32 v19, v20, v19
+; SI-NEXT:    v_add_i32_e32 v20, vcc, 12, v0
+; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
+; SI-NEXT:    v_or_b32_e32 v17, s4, v17
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_add_i32_e32 v19, vcc, 16, v0
 ; SI-NEXT:    buffer_store_dword v17, v19, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 8, v17
-; SI-NEXT:    v_or_b32_e32 v15, v15, v17
-; SI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 24, v19
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v17, 0xff, v17
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT:    v_or_b32_e32 v17, v19, v17
-; SI-NEXT:    v_or_b32_e32 v15, v15, v17
-; SI-NEXT:    v_add_i32_e32 v17, vcc, 16, v0
-; SI-NEXT:    buffer_store_dword v15, v17, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v17, 0xff, v18
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 8, v28
+; SI-NEXT:    v_and_b32_e32 v16, 0xff, v16
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v14
+; SI-NEXT:    s_lshl_b32 s4, s28, 8
+; SI-NEXT:    v_or_b32_e32 v17, v17, v18
+; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 24, v31
+; SI-NEXT:    v_or_b32_e32 v14, s4, v14
+; SI-NEXT:    s_and_b32 s4, s24, 0xff
+; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; SI-NEXT:    v_or_b32_e32 v16, v18, v16
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    s_lshl_b32 s5, s20, 24
+; SI-NEXT:    v_or_b32_e32 v16, v17, v16
+; SI-NEXT:    v_add_i32_e32 v17, vcc, 20, v0
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    buffer_store_dword v16, v17, s[0:3], 0 offen
+; SI-NEXT:    v_or_b32_e32 v14, s4, v14
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v15, 0xff, v16
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 8, v25
-; SI-NEXT:    v_or_b32_e32 v15, v15, v16
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 24, v33
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
-; SI-NEXT:    v_or_b32_e32 v13, v16, v13
-; SI-NEXT:    v_or_b32_e32 v13, v15, v13
-; SI-NEXT:    v_add_i32_e32 v15, vcc, 20, v0
-; SI-NEXT:    buffer_store_dword v13, v15, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v16, vcc, 24, v0
+; SI-NEXT:    buffer_store_dword v14, v16, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 8, v13
-; SI-NEXT:    v_or_b32_e32 v10, v10, v13
-; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 24, v15
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v14, 0xff, v15
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 8, v59
 ; SI-NEXT:    v_and_b32_e32 v13, 0xff, v13
+; SI-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; SI-NEXT:    s_lshl_b32 s4, s44, 8
+; SI-NEXT:    v_or_b32_e32 v14, v14, v15
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 24, v63
+; SI-NEXT:    v_or_b32_e32 v11, s4, v11
+; SI-NEXT:    s_and_b32 s4, s40, 0xff
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v14
 ; SI-NEXT:    v_or_b32_e32 v13, v15, v13
-; SI-NEXT:    v_or_b32_e32 v10, v10, v13
-; SI-NEXT:    v_add_i32_e32 v13, vcc, 24, v0
-; SI-NEXT:    buffer_store_dword v10, v13, s[0:3], 0 offen
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    s_lshl_b32 s5, s26, 24
+; SI-NEXT:    v_or_b32_e32 v13, v14, v13
+; SI-NEXT:    v_add_i32_e32 v14, vcc, 28, v0
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
+; SI-NEXT:    v_or_b32_e32 v11, s4, v11
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v10, 0xff, v11
-; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 8, v11
-; SI-NEXT:    v_or_b32_e32 v10, v10, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 24, v26
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; SI-NEXT:    v_or_b32_e32 v9, v11, v9
-; SI-NEXT:    v_or_b32_e32 v9, v10, v9
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 28, v0
-; SI-NEXT:    buffer_store_dword v9, v10, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v13, vcc, 32, v0
+; SI-NEXT:    buffer_store_dword v11, v13, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 8, v9
-; SI-NEXT:    v_or_b32_e32 v6, v6, v9
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v9, 0xff, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT:    v_or_b32_e32 v9, v10, v9
-; SI-NEXT:    v_or_b32_e32 v6, v6, v9
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 32, v0
-; SI-NEXT:    buffer_store_dword v6, v9, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT:    v_and_b32_e32 v11, 0xff, v12
+; SI-NEXT:    v_lshlrev_b32_e32 v12, 8, v47
+; SI-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; SI-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; SI-NEXT:    s_lshl_b32 s4, s58, 8
+; SI-NEXT:    v_or_b32_e32 v11, v11, v12
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_lshlrev_b32_e32 v12, 24, v56
+; SI-NEXT:    v_or_b32_e32 v8, s4, v8
+; SI-NEXT:    s_and_b32 s4, s46, 0xff
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; SI-NEXT:    v_or_b32_e32 v10, v12, v10
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    s_lshl_b32 s5, s42, 24
+; SI-NEXT:    v_or_b32_e32 v10, v11, v10
+; SI-NEXT:    v_add_i32_e32 v11, vcc, 36, v0
+; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    buffer_store_dword v10, v11, s[0:3], 0 offen
+; SI-NEXT:    v_or_b32_e32 v8, s4, v8
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v6, 0xff, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 8, v29
-; SI-NEXT:    v_or_b32_e32 v6, v6, v9
-; SI-NEXT:    v_and_b32_e32 v9, 0xff, v18
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 24, v10
-; SI-NEXT:    v_or_b32_e32 v9, v10, v9
-; SI-NEXT:    v_or_b32_e32 v6, v6, v9
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 36, v0
-; SI-NEXT:    buffer_store_dword v6, v9, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v10, vcc, 40, v0
+; SI-NEXT:    buffer_store_dword v8, v10, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
-; SI-NEXT:    v_or_b32_e32 v3, v3, v6
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 24, v9
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT:    v_or_b32_e32 v6, v9, v6
-; SI-NEXT:    v_or_b32_e32 v3, v3, v6
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 40, v0
-; SI-NEXT:    buffer_store_dword v3, v6, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v14
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 24, v63
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
-; SI-NEXT:    v_or_b32_e32 v3, v3, v6
-; SI-NEXT:    v_and_b32_e32 v6, 0xff, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_or_b32_e32 v6, v9, v6
-; SI-NEXT:    v_or_b32_e32 v3, v3, v6
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 44, v0
-; SI-NEXT:    buffer_store_dword v3, v6, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v8, 0xff, v9
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 8, v41
+; SI-NEXT:    v_and_b32_e32 v7, 0xff, v7
+; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; SI-NEXT:    s_lshl_b32 s4, s72, 8
+; SI-NEXT:    v_or_b32_e32 v8, v8, v9
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 24, v45
+; SI-NEXT:    v_or_b32_e32 v4, s4, v4
+; SI-NEXT:    s_and_b32 s4, s60, 0xff
+; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT:    v_or_b32_e32 v7, v9, v7
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    s_lshl_b32 s5, s56, 24
+; SI-NEXT:    v_or_b32_e32 v7, v8, v7
+; SI-NEXT:    v_add_i32_e32 v8, vcc, 44, v0
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
+; SI-NEXT:    v_or_b32_e32 v4, s4, v4
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v6
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_or_b32_e32 v3, v6, v3
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 48, v0
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_i32_e32 v7, vcc, 48, v0
+; SI-NEXT:    buffer_store_dword v4, v7, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v59
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_and_b32_e32 v3, 0xff, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    v_or_b32_e32 v3, v6, v3
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 52, v0
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v4, 0xff, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v54
+; SI-NEXT:    v_or_b32_e32 v4, v4, v5
+; SI-NEXT:    v_and_b32_e32 v5, 0xff, v6
+; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; SI-NEXT:    s_lshl_b32 s4, s76, 8
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 24, v42
+; SI-NEXT:    v_or_b32_e32 v3, s4, v3
+; SI-NEXT:    s_and_b32 s4, s74, 0xff
+; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; SI-NEXT:    v_or_b32_e32 v5, v6, v5
+; SI-NEXT:    s_lshl_b32 s4, s4, 16
+; SI-NEXT:    s_lshl_b32 s5, s62, 24
+; SI-NEXT:    v_or_b32_e32 v4, v4, v5
+; SI-NEXT:    v_add_i32_e32 v5, vcc, 52, v0
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
+; SI-NEXT:    v_or_b32_e32 v3, s4, v3
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 56, v0
+; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 56, v0
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xff, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v60
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_and_b32_e32 v2, 0xff, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v48
+; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v51
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 60, v0
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -86462,6 +86745,72 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a,
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
+; SI-NEXT:  .LBB109_4:
+; SI-NEXT:    ; implicit-def: $vgpr23
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $vgpr24
+; SI-NEXT:    ; implicit-def: $vgpr27
+; SI-NEXT:    ; implicit-def: $vgpr22
+; SI-NEXT:    ; implicit-def: $vgpr30
+; SI-NEXT:    ; implicit-def: $vgpr20
+; SI-NEXT:    ; implicit-def: $sgpr16
+; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr8
+; SI-NEXT:    ; implicit-def: $vgpr21
+; SI-NEXT:    ; implicit-def: $vgpr58
+; SI-NEXT:    ; implicit-def: $vgpr19
+; SI-NEXT:    ; implicit-def: $vgpr62
+; SI-NEXT:    ; implicit-def: $vgpr17
+; SI-NEXT:    ; implicit-def: $sgpr22
+; SI-NEXT:    ; implicit-def: $sgpr18
+; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $vgpr18
+; SI-NEXT:    ; implicit-def: $vgpr28
+; SI-NEXT:    ; implicit-def: $vgpr16
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr14
+; SI-NEXT:    ; implicit-def: $sgpr28
+; SI-NEXT:    ; implicit-def: $sgpr24
+; SI-NEXT:    ; implicit-def: $sgpr20
+; SI-NEXT:    ; implicit-def: $vgpr15
+; SI-NEXT:    ; implicit-def: $vgpr59
+; SI-NEXT:    ; implicit-def: $vgpr13
+; SI-NEXT:    ; implicit-def: $vgpr63
+; SI-NEXT:    ; implicit-def: $vgpr11
+; SI-NEXT:    ; implicit-def: $sgpr44
+; SI-NEXT:    ; implicit-def: $sgpr40
+; SI-NEXT:    ; implicit-def: $sgpr26
+; SI-NEXT:    ; implicit-def: $vgpr12
+; SI-NEXT:    ; implicit-def: $vgpr47
+; SI-NEXT:    ; implicit-def: $vgpr10
+; SI-NEXT:    ; implicit-def: $vgpr56
+; SI-NEXT:    ; implicit-def: $vgpr8
+; SI-NEXT:    ; implicit-def: $sgpr58
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr42
+; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $vgpr41
+; SI-NEXT:    ; implicit-def: $vgpr7
+; SI-NEXT:    ; implicit-def: $vgpr45
+; SI-NEXT:    ; implicit-def: $vgpr4
+; SI-NEXT:    ; implicit-def: $sgpr72
+; SI-NEXT:    ; implicit-def: $sgpr60
+; SI-NEXT:    ; implicit-def: $sgpr56
+; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $sgpr76
+; SI-NEXT:    ; implicit-def: $sgpr74
+; SI-NEXT:    ; implicit-def: $sgpr62
+; SI-NEXT:    ; implicit-def: $vgpr54
+; SI-NEXT:    ; implicit-def: $vgpr6
+; SI-NEXT:    ; implicit-def: $vgpr42
+; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $vgpr2
+; SI-NEXT:    ; implicit-def: $vgpr48
+; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr51
+; SI-NEXT:    s_branch .LBB109_2
 ;
 ; VI-LABEL: bitcast_v32bf16_to_v64i8_scalar:
 ; VI:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
index e66762f1e02c2..a1c0a87b65e02 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll
@@ -2760,216 +2760,214 @@ define inreg <36 x i16> @bitcast_v18i32_to_v36i16_scalar(<18 x i32> inreg %a, i3
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; SI-NEXT:    v_readfirstlane_b32 s9, v1
-; SI-NEXT:    v_readfirstlane_b32 s8, v2
-; SI-NEXT:    v_readfirstlane_b32 s7, v3
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_readfirstlane_b32 s6, v4
+; SI-NEXT:    v_readfirstlane_b32 s6, v1
+; SI-NEXT:    v_readfirstlane_b32 s7, v2
+; SI-NEXT:    v_readfirstlane_b32 s4, v3
+; SI-NEXT:    s_and_b64 s[8:9], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s5, v4
 ; SI-NEXT:    s_cbranch_scc0 .LBB13_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s9
-; SI-NEXT:    v_mov_b32_e32 v3, s28
-; SI-NEXT:    v_mov_b32_e32 v4, s26
-; SI-NEXT:    v_mov_b32_e32 v5, s24
-; SI-NEXT:    v_mov_b32_e32 v6, s22
-; SI-NEXT:    v_mov_b32_e32 v7, s20
-; SI-NEXT:    v_mov_b32_e32 v8, s18
-; SI-NEXT:    v_mov_b32_e32 v9, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v2, s8, v2, 16
-; SI-NEXT:    v_alignbit_b32 v3, s29, v3, 16
-; SI-NEXT:    v_alignbit_b32 v4, s27, v4, 16
-; SI-NEXT:    v_alignbit_b32 v5, s25, v5, 16
-; SI-NEXT:    v_alignbit_b32 v6, s23, v6, 16
-; SI-NEXT:    v_alignbit_b32 v7, s21, v7, 16
-; SI-NEXT:    v_alignbit_b32 v8, s19, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, s17, v9, 16
-; SI-NEXT:    s_lshr_b32 s10, s6, 16
-; SI-NEXT:    s_lshr_b32 s11, s8, 16
-; SI-NEXT:    s_lshr_b32 s12, s29, 16
-; SI-NEXT:    s_lshr_b32 s13, s27, 16
-; SI-NEXT:    s_lshr_b32 s14, s25, 16
-; SI-NEXT:    s_lshr_b32 s15, s23, 16
-; SI-NEXT:    s_lshr_b32 s40, s21, 16
-; SI-NEXT:    s_lshr_b32 s41, s19, 16
-; SI-NEXT:    s_lshr_b32 s42, s17, 16
+; SI-NEXT:    s_lshr_b32 s60, s5, 16
+; SI-NEXT:    s_lshr_b32 s61, s7, 16
+; SI-NEXT:    s_lshr_b32 s62, s29, 16
+; SI-NEXT:    s_lshr_b32 s63, s27, 16
+; SI-NEXT:    s_lshr_b32 s72, s25, 16
+; SI-NEXT:    s_lshr_b32 s73, s23, 16
+; SI-NEXT:    s_lshr_b32 s74, s21, 16
+; SI-NEXT:    s_lshr_b32 s75, s19, 16
+; SI-NEXT:    s_lshr_b32 s76, s17, 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[42:43], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB13_3
 ; SI-NEXT:  .LBB13_2: ; %cmp.true
+; SI-NEXT:    s_add_i32 s17, s17, 3
 ; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    s_add_i32 s19, s19, 3
 ; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_add_i32 s21, s21, 3
 ; SI-NEXT:    s_add_i32 s20, s20, 3
+; SI-NEXT:    s_add_i32 s23, s23, 3
 ; SI-NEXT:    s_add_i32 s22, s22, 3
+; SI-NEXT:    s_add_i32 s25, s25, 3
 ; SI-NEXT:    s_add_i32 s24, s24, 3
+; SI-NEXT:    s_add_i32 s27, s27, 3
 ; SI-NEXT:    s_add_i32 s26, s26, 3
+; SI-NEXT:    s_add_i32 s29, s29, 3
 ; SI-NEXT:    s_add_i32 s28, s28, 3
-; SI-NEXT:    s_add_i32 s9, s9, 3
 ; SI-NEXT:    s_add_i32 s7, s7, 3
-; SI-NEXT:    s_add_i32 s17, s17, 3
-; SI-NEXT:    s_add_i32 s19, s19, 3
-; SI-NEXT:    s_add_i32 s21, s21, 3
-; SI-NEXT:    s_add_i32 s23, s23, 3
-; SI-NEXT:    s_add_i32 s25, s25, 3
-; SI-NEXT:    s_add_i32 s27, s27, 3
-; SI-NEXT:    s_add_i32 s29, s29, 3
-; SI-NEXT:    s_add_i32 s8, s8, 3
 ; SI-NEXT:    s_add_i32 s6, s6, 3
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s9
-; SI-NEXT:    v_mov_b32_e32 v3, s28
-; SI-NEXT:    v_mov_b32_e32 v4, s26
-; SI-NEXT:    v_mov_b32_e32 v5, s24
-; SI-NEXT:    v_mov_b32_e32 v6, s22
-; SI-NEXT:    v_mov_b32_e32 v7, s20
-; SI-NEXT:    v_mov_b32_e32 v8, s18
-; SI-NEXT:    v_mov_b32_e32 v9, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v2, s8, v2, 16
-; SI-NEXT:    v_alignbit_b32 v3, s29, v3, 16
-; SI-NEXT:    v_alignbit_b32 v4, s27, v4, 16
-; SI-NEXT:    v_alignbit_b32 v5, s25, v5, 16
-; SI-NEXT:    v_alignbit_b32 v6, s23, v6, 16
-; SI-NEXT:    v_alignbit_b32 v7, s21, v7, 16
-; SI-NEXT:    v_alignbit_b32 v8, s19, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, s17, v9, 16
-; SI-NEXT:    s_lshr_b32 s10, s6, 16
-; SI-NEXT:    s_lshr_b32 s11, s8, 16
-; SI-NEXT:    s_lshr_b32 s12, s29, 16
-; SI-NEXT:    s_lshr_b32 s13, s27, 16
-; SI-NEXT:    s_lshr_b32 s14, s25, 16
-; SI-NEXT:    s_lshr_b32 s15, s23, 16
-; SI-NEXT:    s_lshr_b32 s40, s21, 16
-; SI-NEXT:    s_lshr_b32 s41, s19, 16
-; SI-NEXT:    s_lshr_b32 s42, s17, 16
+; SI-NEXT:    s_add_i32 s5, s5, 3
+; SI-NEXT:    s_add_i32 s4, s4, 3
+; SI-NEXT:    s_lshr_b64 s[8:9], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[42:43], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[16:17], 16
+; SI-NEXT:    s_lshr_b32 s60, s5, 16
+; SI-NEXT:    s_lshr_b32 s61, s7, 16
+; SI-NEXT:    s_lshr_b32 s62, s29, 16
+; SI-NEXT:    s_lshr_b32 s63, s27, 16
+; SI-NEXT:    s_lshr_b32 s72, s25, 16
+; SI-NEXT:    s_lshr_b32 s73, s23, 16
+; SI-NEXT:    s_lshr_b32 s74, s21, 16
+; SI-NEXT:    s_lshr_b32 s75, s19, 16
+; SI-NEXT:    s_lshr_b32 s76, s17, 16
 ; SI-NEXT:  .LBB13_3: ; %end
-; SI-NEXT:    s_and_b32 s4, s16, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT:    v_or_b32_e32 v9, s4, v9
-; SI-NEXT:    s_and_b32 s4, s17, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s42, 16
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_mov_b32_e32 v10, s4
-; SI-NEXT:    s_and_b32 s4, s18, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; SI-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 4, v0
-; SI-NEXT:    v_or_b32_e32 v8, s4, v8
-; SI-NEXT:    s_and_b32 s4, s19, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s41, 16
-; SI-NEXT:    buffer_store_dword v10, v9, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 8, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v8, v9, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v9, s4
-; SI-NEXT:    s_and_b32 s4, s20, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT:    s_lshl_b32 s9, s56, 16
+; SI-NEXT:    s_and_b32 s11, s16, 0xffff
+; SI-NEXT:    s_or_b32 s9, s11, s9
+; SI-NEXT:    v_mov_b32_e32 v1, s9
+; SI-NEXT:    s_and_b32 s9, s17, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s76, 16
+; SI-NEXT:    s_or_b32 s9, s9, s11
+; SI-NEXT:    v_mov_b32_e32 v2, s9
+; SI-NEXT:    s_and_b32 s9, s18, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s46, 16
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 12, v0
-; SI-NEXT:    v_or_b32_e32 v7, s4, v7
-; SI-NEXT:    s_and_b32 s4, s21, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s40, 16
-; SI-NEXT:    buffer_store_dword v9, v8, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 16, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v8, s4
-; SI-NEXT:    s_and_b32 s4, s22, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
+; SI-NEXT:    s_or_b32 s9, s9, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 20, v0
-; SI-NEXT:    v_or_b32_e32 v6, s4, v6
-; SI-NEXT:    s_and_b32 s4, s23, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s15, 16
-; SI-NEXT:    buffer_store_dword v8, v7, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 24, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v7, s4
-; SI-NEXT:    s_and_b32 s4, s24, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 28, v0
-; SI-NEXT:    v_or_b32_e32 v5, s4, v5
-; SI-NEXT:    s_and_b32 s4, s25, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s14, 16
-; SI-NEXT:    buffer_store_dword v7, v6, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 32, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v6, s4
-; SI-NEXT:    s_and_b32 s4, s26, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_mov_b32_e32 v2, s9
+; SI-NEXT:    s_and_b32 s9, s19, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s75, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
+; SI-NEXT:    s_or_b32 s9, s9, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 36, v0
-; SI-NEXT:    v_or_b32_e32 v4, s4, v4
-; SI-NEXT:    s_and_b32 s4, s27, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s13, 16
-; SI-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 40, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v5, s4
-; SI-NEXT:    s_and_b32 s4, s28, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_mov_b32_e32 v2, s9
+; SI-NEXT:    s_and_b32 s9, s20, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s44, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 12, v0
+; SI-NEXT:    s_or_b32 s9, s9, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 44, v0
-; SI-NEXT:    v_or_b32_e32 v3, s4, v3
-; SI-NEXT:    s_and_b32 s4, s29, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s12, 16
-; SI-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v4, s4
-; SI-NEXT:    s_and_b32 s4, s9, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_mov_b32_e32 v2, s9
+; SI-NEXT:    s_and_b32 s9, s21, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s74, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 16, v0
+; SI-NEXT:    s_or_b32 s9, s9, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 52, v0
-; SI-NEXT:    v_or_b32_e32 v2, s4, v2
-; SI-NEXT:    s_and_b32 s4, s8, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s11, 16
-; SI-NEXT:    buffer_store_dword v4, v3, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 56, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v3, s4
-; SI-NEXT:    s_and_b32 s4, s7, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_mov_b32_e32 v2, s9
+; SI-NEXT:    s_and_b32 s9, s22, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s42, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 20, v0
+; SI-NEXT:    s_or_b32 s9, s9, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 60, v0
-; SI-NEXT:    v_or_b32_e32 v1, s4, v1
-; SI-NEXT:    s_and_b32 s4, s6, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s10, 16
-; SI-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 64, v0
+; SI-NEXT:    v_mov_b32_e32 v2, s9
+; SI-NEXT:    s_and_b32 s9, s23, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s73, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 24, v0
+; SI-NEXT:    s_or_b32 s9, s9, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s9
+; SI-NEXT:    s_and_b32 s9, s24, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s40, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 28, v0
+; SI-NEXT:    s_or_b32 s9, s9, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s9
+; SI-NEXT:    s_and_b32 s9, s25, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s72, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 32, v0
+; SI-NEXT:    s_or_b32 s9, s9, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s9
+; SI-NEXT:    s_and_b32 s9, s26, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s14, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 36, v0
+; SI-NEXT:    s_or_b32 s9, s9, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s9
+; SI-NEXT:    s_and_b32 s9, s27, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s63, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 40, v0
+; SI-NEXT:    s_or_b32 s9, s9, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s9
+; SI-NEXT:    s_and_b32 s9, s28, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s12, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 44, v0
+; SI-NEXT:    s_or_b32 s9, s9, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s9
+; SI-NEXT:    s_and_b32 s9, s29, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s62, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 48, v0
+; SI-NEXT:    s_or_b32 s9, s9, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s9
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_lshl_b32 s9, s10, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 52, v0
+; SI-NEXT:    s_or_b32 s6, s6, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s6, s7, 0xffff
+; SI-NEXT:    s_lshl_b32 s7, s61, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 56, v0
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; SI-NEXT:    s_lshl_b32 s6, s8, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 60, v0
+; SI-NEXT:    s_or_b32 s4, s4, s6
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s4
+; SI-NEXT:    s_and_b32 s4, s5, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s60, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 64, v0
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x44, v0
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB13_4:
-; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $sgpr56
+; SI-NEXT:    ; implicit-def: $sgpr76
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr75
+; SI-NEXT:    ; implicit-def: $sgpr44
+; SI-NEXT:    ; implicit-def: $sgpr74
 ; SI-NEXT:    ; implicit-def: $sgpr42
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $sgpr41
-; SI-NEXT:    ; implicit-def: $vgpr7
+; SI-NEXT:    ; implicit-def: $sgpr73
 ; SI-NEXT:    ; implicit-def: $sgpr40
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $sgpr15
-; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $sgpr72
 ; SI-NEXT:    ; implicit-def: $sgpr14
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $sgpr13
-; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $sgpr63
 ; SI-NEXT:    ; implicit-def: $sgpr12
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $sgpr11
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $sgpr62
 ; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr61
+; SI-NEXT:    ; implicit-def: $sgpr8
+; SI-NEXT:    ; implicit-def: $sgpr60
 ; SI-NEXT:    s_branch .LBB13_2
 ;
 ; VI-LABEL: bitcast_v18i32_to_v36i16_scalar:
@@ -9746,207 +9744,207 @@ define inreg <36 x i16> @bitcast_v18f32_to_v36i16_scalar(<18 x float> inreg %a,
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; SI-NEXT:    v_mov_b32_e32 v19, s16
-; SI-NEXT:    v_mov_b32_e32 v18, s17
+; SI-NEXT:    v_mov_b32_e32 v18, s16
+; SI-NEXT:    v_mov_b32_e32 v19, s17
 ; SI-NEXT:    v_mov_b32_e32 v16, s18
-; SI-NEXT:    v_mov_b32_e32 v15, s19
+; SI-NEXT:    v_mov_b32_e32 v17, s19
 ; SI-NEXT:    v_mov_b32_e32 v14, s20
-; SI-NEXT:    v_mov_b32_e32 v13, s21
+; SI-NEXT:    v_mov_b32_e32 v15, s21
 ; SI-NEXT:    v_mov_b32_e32 v12, s22
-; SI-NEXT:    v_mov_b32_e32 v11, s23
+; SI-NEXT:    v_mov_b32_e32 v13, s23
 ; SI-NEXT:    v_mov_b32_e32 v10, s24
-; SI-NEXT:    v_mov_b32_e32 v9, s25
+; SI-NEXT:    v_mov_b32_e32 v11, s25
 ; SI-NEXT:    v_mov_b32_e32 v8, s26
-; SI-NEXT:    v_mov_b32_e32 v7, s27
+; SI-NEXT:    v_mov_b32_e32 v9, s27
 ; SI-NEXT:    v_mov_b32_e32 v6, s28
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_mov_b32_e32 v5, s29
+; SI-NEXT:    v_mov_b32_e32 v7, s29
 ; SI-NEXT:    s_cbranch_scc0 .LBB29_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_alignbit_b32 v17, v4, v3, 16
-; SI-NEXT:    v_alignbit_b32 v20, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v21, v5, v6, 16
-; SI-NEXT:    v_alignbit_b32 v22, v7, v8, 16
-; SI-NEXT:    v_alignbit_b32 v25, v9, v10, 16
-; SI-NEXT:    v_alignbit_b32 v27, v11, v12, 16
-; SI-NEXT:    v_alignbit_b32 v29, v13, v14, 16
-; SI-NEXT:    v_alignbit_b32 v31, v15, v16, 16
-; SI-NEXT:    v_alignbit_b32 v33, v18, v19, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v24, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v26, 16, v5
-; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v7
-; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v11
+; SI-NEXT:    v_lshr_b64 v[20:21], v[3:4], 16
+; SI-NEXT:    v_lshr_b64 v[21:22], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[22:23], v[6:7], 16
+; SI-NEXT:    v_lshr_b64 v[23:24], v[8:9], 16
+; SI-NEXT:    v_lshr_b64 v[24:25], v[10:11], 16
+; SI-NEXT:    v_lshr_b64 v[25:26], v[12:13], 16
+; SI-NEXT:    v_lshr_b64 v[26:27], v[14:15], 16
+; SI-NEXT:    v_lshr_b64 v[27:28], v[16:17], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v9
+; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v11
 ; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v13
 ; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v15
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v17
+; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v19
+; SI-NEXT:    v_lshr_b64 v[28:29], v[18:19], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB29_3
 ; SI-NEXT:  .LBB29_2: ; %cmp.true
-; SI-NEXT:    v_add_f32_e32 v18, 1.0, v18
-; SI-NEXT:    v_add_f32_e32 v19, 1.0, v19
-; SI-NEXT:    v_add_f32_e32 v15, 1.0, v15
-; SI-NEXT:    v_add_f32_e32 v16, 1.0, v16
-; SI-NEXT:    v_add_f32_e32 v13, 1.0, v13
-; SI-NEXT:    v_add_f32_e32 v14, 1.0, v14
-; SI-NEXT:    v_add_f32_e32 v11, 1.0, v11
-; SI-NEXT:    v_add_f32_e32 v12, 1.0, v12
-; SI-NEXT:    v_add_f32_e32 v9, 1.0, v9
-; SI-NEXT:    v_add_f32_e32 v10, 1.0, v10
-; SI-NEXT:    v_add_f32_e32 v7, 1.0, v7
-; SI-NEXT:    v_add_f32_e32 v8, 1.0, v8
-; SI-NEXT:    v_add_f32_e32 v5, 1.0, v5
-; SI-NEXT:    v_add_f32_e32 v6, 1.0, v6
-; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; SI-NEXT:    v_add_f32_e32 v4, 1.0, v4
 ; SI-NEXT:    v_add_f32_e32 v3, 1.0, v3
-; SI-NEXT:    v_alignbit_b32 v17, v4, v3, 16
-; SI-NEXT:    v_alignbit_b32 v20, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v21, v5, v6, 16
-; SI-NEXT:    v_alignbit_b32 v22, v7, v8, 16
-; SI-NEXT:    v_alignbit_b32 v25, v9, v10, 16
-; SI-NEXT:    v_alignbit_b32 v27, v11, v12, 16
-; SI-NEXT:    v_alignbit_b32 v29, v13, v14, 16
-; SI-NEXT:    v_alignbit_b32 v31, v15, v16, 16
-; SI-NEXT:    v_alignbit_b32 v33, v18, v19, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v24, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v26, 16, v5
-; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v7
-; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v11
+; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_lshr_b64 v[20:21], v[3:4], 16
+; SI-NEXT:    v_add_f32_e32 v7, 1.0, v7
+; SI-NEXT:    v_add_f32_e32 v6, 1.0, v6
+; SI-NEXT:    v_lshr_b64 v[21:22], v[1:2], 16
+; SI-NEXT:    v_add_f32_e32 v9, 1.0, v9
+; SI-NEXT:    v_add_f32_e32 v8, 1.0, v8
+; SI-NEXT:    v_lshr_b64 v[22:23], v[6:7], 16
+; SI-NEXT:    v_add_f32_e32 v11, 1.0, v11
+; SI-NEXT:    v_add_f32_e32 v10, 1.0, v10
+; SI-NEXT:    v_lshr_b64 v[23:24], v[8:9], 16
+; SI-NEXT:    v_add_f32_e32 v13, 1.0, v13
+; SI-NEXT:    v_add_f32_e32 v12, 1.0, v12
+; SI-NEXT:    v_lshr_b64 v[24:25], v[10:11], 16
+; SI-NEXT:    v_add_f32_e32 v15, 1.0, v15
+; SI-NEXT:    v_add_f32_e32 v14, 1.0, v14
+; SI-NEXT:    v_lshr_b64 v[25:26], v[12:13], 16
+; SI-NEXT:    v_add_f32_e32 v17, 1.0, v17
+; SI-NEXT:    v_add_f32_e32 v16, 1.0, v16
+; SI-NEXT:    v_lshr_b64 v[26:27], v[14:15], 16
+; SI-NEXT:    v_add_f32_e32 v19, 1.0, v19
+; SI-NEXT:    v_add_f32_e32 v18, 1.0, v18
+; SI-NEXT:    v_lshr_b64 v[27:28], v[16:17], 16
+; SI-NEXT:    v_lshr_b64 v[28:29], v[18:19], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v9
+; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v11
 ; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v13
 ; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v15
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v17
+; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v19
 ; SI-NEXT:  .LBB29_3: ; %end
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v33
-; SI-NEXT:    v_or_b32_e32 v19, v19, v33
-; SI-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
 ; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v18
+; SI-NEXT:    v_or_b32_e32 v18, v18, v28
+; SI-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v36
+; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v19
+; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v37
 ; SI-NEXT:    v_or_b32_e32 v18, v18, v19
 ; SI-NEXT:    v_add_i32_e32 v19, vcc, 4, v0
 ; SI-NEXT:    buffer_store_dword v18, v19, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v16
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v31
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v27
+; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v16
 ; SI-NEXT:    v_or_b32_e32 v16, v16, v18
 ; SI-NEXT:    v_add_i32_e32 v18, vcc, 8, v0
 ; SI-NEXT:    buffer_store_dword v16, v18, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v35
-; SI-NEXT:    v_or_b32_e32 v15, v15, v16
-; SI-NEXT:    v_add_i32_e32 v16, vcc, 12, v0
-; SI-NEXT:    buffer_store_dword v15, v16, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v17
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v36
+; SI-NEXT:    v_or_b32_e32 v16, v16, v17
+; SI-NEXT:    v_add_i32_e32 v17, vcc, 12, v0
+; SI-NEXT:    buffer_store_dword v16, v17, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v26
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; SI-NEXT:    v_or_b32_e32 v14, v14, v16
+; SI-NEXT:    v_add_i32_e32 v16, vcc, 16, v0
+; SI-NEXT:    buffer_store_dword v14, v16, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v29
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v15
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v35
 ; SI-NEXT:    v_or_b32_e32 v14, v14, v15
-; SI-NEXT:    v_add_i32_e32 v15, vcc, 16, v0
+; SI-NEXT:    v_add_i32_e32 v15, vcc, 20, v0
 ; SI-NEXT:    buffer_store_dword v14, v15, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v34
-; SI-NEXT:    v_or_b32_e32 v13, v13, v14
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 20, v0
-; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v12, 0xffff, v12
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v27
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v25
+; SI-NEXT:    v_or_b32_e32 v12, v12, v14
+; SI-NEXT:    v_add_i32_e32 v14, vcc, 24, v0
+; SI-NEXT:    buffer_store_dword v12, v14, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v12, 0xffff, v13
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v34
 ; SI-NEXT:    v_or_b32_e32 v12, v12, v13
-; SI-NEXT:    v_add_i32_e32 v13, vcc, 24, v0
+; SI-NEXT:    v_add_i32_e32 v13, vcc, 28, v0
 ; SI-NEXT:    buffer_store_dword v12, v13, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v32
-; SI-NEXT:    v_or_b32_e32 v11, v11, v12
-; SI-NEXT:    v_add_i32_e32 v12, vcc, 28, v0
-; SI-NEXT:    buffer_store_dword v11, v12, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v10
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v25
+; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v24
+; SI-NEXT:    v_or_b32_e32 v10, v10, v12
+; SI-NEXT:    v_add_i32_e32 v12, vcc, 32, v0
+; SI-NEXT:    buffer_store_dword v10, v12, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v33
 ; SI-NEXT:    v_or_b32_e32 v10, v10, v11
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 32, v0
+; SI-NEXT:    v_add_i32_e32 v11, vcc, 36, v0
 ; SI-NEXT:    buffer_store_dword v10, v11, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v30
-; SI-NEXT:    v_or_b32_e32 v9, v9, v10
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 36, v0
-; SI-NEXT:    buffer_store_dword v9, v10, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v22
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v23
+; SI-NEXT:    v_or_b32_e32 v8, v8, v10
+; SI-NEXT:    v_add_i32_e32 v10, vcc, 40, v0
+; SI-NEXT:    buffer_store_dword v8, v10, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v9
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v32
 ; SI-NEXT:    v_or_b32_e32 v8, v8, v9
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 40, v0
+; SI-NEXT:    v_add_i32_e32 v9, vcc, 44, v0
 ; SI-NEXT:    buffer_store_dword v8, v9, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v28
-; SI-NEXT:    v_or_b32_e32 v7, v7, v8
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 44, v0
-; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v21
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v22
+; SI-NEXT:    v_or_b32_e32 v6, v6, v8
+; SI-NEXT:    v_add_i32_e32 v8, vcc, 48, v0
+; SI-NEXT:    buffer_store_dword v6, v8, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v31
 ; SI-NEXT:    v_or_b32_e32 v6, v6, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 48, v0
+; SI-NEXT:    v_add_i32_e32 v7, vcc, 52, v0
 ; SI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v26
-; SI-NEXT:    v_or_b32_e32 v5, v5, v6
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 52, v0
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v20
-; SI-NEXT:    v_or_b32_e32 v1, v1, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 56, v0
-; SI-NEXT:    buffer_store_dword v1, v5, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v21
+; SI-NEXT:    v_or_b32_e32 v1, v1, v6
+; SI-NEXT:    v_add_i32_e32 v6, vcc, 56, v0
+; SI-NEXT:    buffer_store_dword v1, v6, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v24
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v30
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 60, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v17
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v20
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 64, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v23
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x44, v0
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB29_4:
-; SI-NEXT:    ; implicit-def: $vgpr33
+; SI-NEXT:    ; implicit-def: $vgpr28
+; SI-NEXT:    ; implicit-def: $vgpr37
+; SI-NEXT:    ; implicit-def: $vgpr27
 ; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr26
 ; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr29
+; SI-NEXT:    ; implicit-def: $vgpr25
 ; SI-NEXT:    ; implicit-def: $vgpr34
-; SI-NEXT:    ; implicit-def: $vgpr27
+; SI-NEXT:    ; implicit-def: $vgpr24
+; SI-NEXT:    ; implicit-def: $vgpr33
+; SI-NEXT:    ; implicit-def: $vgpr23
 ; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr25
+; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    ; implicit-def: $vgpr30
+; SI-NEXT:    ; implicit-def: $vgpr5
 ; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr28
 ; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr26
 ; SI-NEXT:    ; implicit-def: $vgpr20
-; SI-NEXT:    ; implicit-def: $vgpr24
-; SI-NEXT:    ; implicit-def: $vgpr17
-; SI-NEXT:    ; implicit-def: $vgpr23
 ; SI-NEXT:    s_branch .LBB29_2
 ;
 ; VI-LABEL: bitcast_v18f32_to_v36i16_scalar:
@@ -15972,216 +15970,214 @@ define inreg <36 x i16> @bitcast_v9i64_to_v36i16_scalar(<9 x i64> inreg %a, i32
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; SI-NEXT:    v_readfirstlane_b32 s9, v1
-; SI-NEXT:    v_readfirstlane_b32 s8, v2
-; SI-NEXT:    v_readfirstlane_b32 s7, v3
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_readfirstlane_b32 s6, v4
+; SI-NEXT:    v_readfirstlane_b32 s6, v1
+; SI-NEXT:    v_readfirstlane_b32 s7, v2
+; SI-NEXT:    v_readfirstlane_b32 s4, v3
+; SI-NEXT:    s_and_b64 s[8:9], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s5, v4
 ; SI-NEXT:    s_cbranch_scc0 .LBB41_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s9
-; SI-NEXT:    v_mov_b32_e32 v3, s28
-; SI-NEXT:    v_mov_b32_e32 v4, s26
-; SI-NEXT:    v_mov_b32_e32 v5, s24
-; SI-NEXT:    v_mov_b32_e32 v6, s22
-; SI-NEXT:    v_mov_b32_e32 v7, s20
-; SI-NEXT:    v_mov_b32_e32 v8, s18
-; SI-NEXT:    v_mov_b32_e32 v9, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v2, s8, v2, 16
-; SI-NEXT:    v_alignbit_b32 v3, s29, v3, 16
-; SI-NEXT:    v_alignbit_b32 v4, s27, v4, 16
-; SI-NEXT:    v_alignbit_b32 v5, s25, v5, 16
-; SI-NEXT:    v_alignbit_b32 v6, s23, v6, 16
-; SI-NEXT:    v_alignbit_b32 v7, s21, v7, 16
-; SI-NEXT:    v_alignbit_b32 v8, s19, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, s17, v9, 16
-; SI-NEXT:    s_lshr_b32 s10, s6, 16
-; SI-NEXT:    s_lshr_b32 s11, s8, 16
-; SI-NEXT:    s_lshr_b32 s12, s29, 16
-; SI-NEXT:    s_lshr_b32 s13, s27, 16
-; SI-NEXT:    s_lshr_b32 s14, s25, 16
-; SI-NEXT:    s_lshr_b32 s15, s23, 16
-; SI-NEXT:    s_lshr_b32 s40, s21, 16
-; SI-NEXT:    s_lshr_b32 s41, s19, 16
-; SI-NEXT:    s_lshr_b32 s42, s17, 16
+; SI-NEXT:    s_lshr_b32 s60, s5, 16
+; SI-NEXT:    s_lshr_b32 s61, s7, 16
+; SI-NEXT:    s_lshr_b32 s62, s29, 16
+; SI-NEXT:    s_lshr_b32 s63, s27, 16
+; SI-NEXT:    s_lshr_b32 s72, s25, 16
+; SI-NEXT:    s_lshr_b32 s73, s23, 16
+; SI-NEXT:    s_lshr_b32 s74, s21, 16
+; SI-NEXT:    s_lshr_b32 s75, s19, 16
+; SI-NEXT:    s_lshr_b32 s76, s17, 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[42:43], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB41_3
 ; SI-NEXT:  .LBB41_2: ; %cmp.true
-; SI-NEXT:    s_add_u32 s16, s16, 3
-; SI-NEXT:    s_addc_u32 s17, s17, 0
-; SI-NEXT:    s_add_u32 s18, s18, 3
-; SI-NEXT:    s_addc_u32 s19, s19, 0
-; SI-NEXT:    s_add_u32 s20, s20, 3
-; SI-NEXT:    s_addc_u32 s21, s21, 0
-; SI-NEXT:    s_add_u32 s22, s22, 3
-; SI-NEXT:    s_addc_u32 s23, s23, 0
-; SI-NEXT:    s_add_u32 s24, s24, 3
-; SI-NEXT:    s_addc_u32 s25, s25, 0
-; SI-NEXT:    s_add_u32 s26, s26, 3
-; SI-NEXT:    s_addc_u32 s27, s27, 0
+; SI-NEXT:    s_add_u32 s4, s4, 3
+; SI-NEXT:    s_addc_u32 s5, s5, 0
+; SI-NEXT:    s_add_u32 s6, s6, 3
+; SI-NEXT:    s_addc_u32 s7, s7, 0
 ; SI-NEXT:    s_add_u32 s28, s28, 3
 ; SI-NEXT:    s_addc_u32 s29, s29, 0
-; SI-NEXT:    s_add_u32 s9, s9, 3
-; SI-NEXT:    s_addc_u32 s8, s8, 0
-; SI-NEXT:    s_add_u32 s7, s7, 3
-; SI-NEXT:    s_addc_u32 s6, s6, 0
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s9
-; SI-NEXT:    v_mov_b32_e32 v3, s28
-; SI-NEXT:    v_mov_b32_e32 v4, s26
-; SI-NEXT:    v_mov_b32_e32 v5, s24
-; SI-NEXT:    v_mov_b32_e32 v6, s22
-; SI-NEXT:    v_mov_b32_e32 v7, s20
-; SI-NEXT:    v_mov_b32_e32 v8, s18
-; SI-NEXT:    v_mov_b32_e32 v9, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v2, s8, v2, 16
-; SI-NEXT:    v_alignbit_b32 v3, s29, v3, 16
-; SI-NEXT:    v_alignbit_b32 v4, s27, v4, 16
-; SI-NEXT:    v_alignbit_b32 v5, s25, v5, 16
-; SI-NEXT:    v_alignbit_b32 v6, s23, v6, 16
-; SI-NEXT:    v_alignbit_b32 v7, s21, v7, 16
-; SI-NEXT:    v_alignbit_b32 v8, s19, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, s17, v9, 16
-; SI-NEXT:    s_lshr_b32 s10, s6, 16
-; SI-NEXT:    s_lshr_b32 s11, s8, 16
-; SI-NEXT:    s_lshr_b32 s12, s29, 16
-; SI-NEXT:    s_lshr_b32 s13, s27, 16
-; SI-NEXT:    s_lshr_b32 s14, s25, 16
-; SI-NEXT:    s_lshr_b32 s15, s23, 16
-; SI-NEXT:    s_lshr_b32 s40, s21, 16
-; SI-NEXT:    s_lshr_b32 s41, s19, 16
-; SI-NEXT:    s_lshr_b32 s42, s17, 16
+; SI-NEXT:    s_add_u32 s26, s26, 3
+; SI-NEXT:    s_addc_u32 s27, s27, 0
+; SI-NEXT:    s_add_u32 s24, s24, 3
+; SI-NEXT:    s_addc_u32 s25, s25, 0
+; SI-NEXT:    s_add_u32 s22, s22, 3
+; SI-NEXT:    s_addc_u32 s23, s23, 0
+; SI-NEXT:    s_add_u32 s20, s20, 3
+; SI-NEXT:    s_addc_u32 s21, s21, 0
+; SI-NEXT:    s_add_u32 s18, s18, 3
+; SI-NEXT:    s_addc_u32 s19, s19, 0
+; SI-NEXT:    s_add_u32 s16, s16, 3
+; SI-NEXT:    s_addc_u32 s17, s17, 0
+; SI-NEXT:    s_lshr_b32 s60, s5, 16
+; SI-NEXT:    s_lshr_b32 s61, s7, 16
+; SI-NEXT:    s_lshr_b32 s62, s29, 16
+; SI-NEXT:    s_lshr_b32 s63, s27, 16
+; SI-NEXT:    s_lshr_b32 s72, s25, 16
+; SI-NEXT:    s_lshr_b32 s73, s23, 16
+; SI-NEXT:    s_lshr_b32 s74, s21, 16
+; SI-NEXT:    s_lshr_b32 s75, s19, 16
+; SI-NEXT:    s_lshr_b32 s76, s17, 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[42:43], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[16:17], 16
 ; SI-NEXT:  .LBB41_3: ; %end
-; SI-NEXT:    s_and_b32 s4, s16, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT:    v_or_b32_e32 v9, s4, v9
-; SI-NEXT:    s_and_b32 s4, s17, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s42, 16
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_mov_b32_e32 v10, s4
-; SI-NEXT:    s_and_b32 s4, s18, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; SI-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 4, v0
-; SI-NEXT:    v_or_b32_e32 v8, s4, v8
-; SI-NEXT:    s_and_b32 s4, s19, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s41, 16
-; SI-NEXT:    buffer_store_dword v10, v9, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 8, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v8, v9, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v9, s4
-; SI-NEXT:    s_and_b32 s4, s20, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT:    s_lshl_b32 s9, s56, 16
+; SI-NEXT:    s_and_b32 s11, s16, 0xffff
+; SI-NEXT:    s_or_b32 s9, s11, s9
+; SI-NEXT:    v_mov_b32_e32 v1, s9
+; SI-NEXT:    s_and_b32 s9, s17, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s76, 16
+; SI-NEXT:    s_or_b32 s9, s9, s11
+; SI-NEXT:    v_mov_b32_e32 v2, s9
+; SI-NEXT:    s_and_b32 s9, s18, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s46, 16
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 12, v0
-; SI-NEXT:    v_or_b32_e32 v7, s4, v7
-; SI-NEXT:    s_and_b32 s4, s21, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s40, 16
-; SI-NEXT:    buffer_store_dword v9, v8, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 16, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v8, s4
-; SI-NEXT:    s_and_b32 s4, s22, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
+; SI-NEXT:    s_or_b32 s9, s9, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 20, v0
-; SI-NEXT:    v_or_b32_e32 v6, s4, v6
-; SI-NEXT:    s_and_b32 s4, s23, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s15, 16
-; SI-NEXT:    buffer_store_dword v8, v7, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 24, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v7, s4
-; SI-NEXT:    s_and_b32 s4, s24, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 28, v0
-; SI-NEXT:    v_or_b32_e32 v5, s4, v5
-; SI-NEXT:    s_and_b32 s4, s25, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s14, 16
-; SI-NEXT:    buffer_store_dword v7, v6, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 32, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v6, s4
-; SI-NEXT:    s_and_b32 s4, s26, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_mov_b32_e32 v2, s9
+; SI-NEXT:    s_and_b32 s9, s19, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s75, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
+; SI-NEXT:    s_or_b32 s9, s9, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 36, v0
-; SI-NEXT:    v_or_b32_e32 v4, s4, v4
-; SI-NEXT:    s_and_b32 s4, s27, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s13, 16
-; SI-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 40, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v5, s4
-; SI-NEXT:    s_and_b32 s4, s28, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_mov_b32_e32 v2, s9
+; SI-NEXT:    s_and_b32 s9, s20, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s44, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 12, v0
+; SI-NEXT:    s_or_b32 s9, s9, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 44, v0
-; SI-NEXT:    v_or_b32_e32 v3, s4, v3
-; SI-NEXT:    s_and_b32 s4, s29, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s12, 16
-; SI-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v4, s4
-; SI-NEXT:    s_and_b32 s4, s9, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_mov_b32_e32 v2, s9
+; SI-NEXT:    s_and_b32 s9, s21, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s74, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 16, v0
+; SI-NEXT:    s_or_b32 s9, s9, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 52, v0
-; SI-NEXT:    v_or_b32_e32 v2, s4, v2
-; SI-NEXT:    s_and_b32 s4, s8, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s11, 16
-; SI-NEXT:    buffer_store_dword v4, v3, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 56, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v3, s4
-; SI-NEXT:    s_and_b32 s4, s7, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_mov_b32_e32 v2, s9
+; SI-NEXT:    s_and_b32 s9, s22, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s42, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 20, v0
+; SI-NEXT:    s_or_b32 s9, s9, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 60, v0
-; SI-NEXT:    v_or_b32_e32 v1, s4, v1
-; SI-NEXT:    s_and_b32 s4, s6, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s10, 16
-; SI-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 64, v0
+; SI-NEXT:    v_mov_b32_e32 v2, s9
+; SI-NEXT:    s_and_b32 s9, s23, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s73, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 24, v0
+; SI-NEXT:    s_or_b32 s9, s9, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s9
+; SI-NEXT:    s_and_b32 s9, s24, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s40, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 28, v0
+; SI-NEXT:    s_or_b32 s9, s9, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s9
+; SI-NEXT:    s_and_b32 s9, s25, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s72, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 32, v0
+; SI-NEXT:    s_or_b32 s9, s9, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s9
+; SI-NEXT:    s_and_b32 s9, s26, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s14, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 36, v0
+; SI-NEXT:    s_or_b32 s9, s9, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s9
+; SI-NEXT:    s_and_b32 s9, s27, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s63, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 40, v0
+; SI-NEXT:    s_or_b32 s9, s9, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s9
+; SI-NEXT:    s_and_b32 s9, s28, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s12, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 44, v0
+; SI-NEXT:    s_or_b32 s9, s9, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s9
+; SI-NEXT:    s_and_b32 s9, s29, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s62, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 48, v0
+; SI-NEXT:    s_or_b32 s9, s9, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s9
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_lshl_b32 s9, s10, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 52, v0
+; SI-NEXT:    s_or_b32 s6, s6, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s6, s7, 0xffff
+; SI-NEXT:    s_lshl_b32 s7, s61, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 56, v0
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; SI-NEXT:    s_lshl_b32 s6, s8, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 60, v0
+; SI-NEXT:    s_or_b32 s4, s4, s6
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s4
+; SI-NEXT:    s_and_b32 s4, s5, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s60, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 64, v0
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x44, v0
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB41_4:
-; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $sgpr56
+; SI-NEXT:    ; implicit-def: $sgpr76
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr75
+; SI-NEXT:    ; implicit-def: $sgpr44
+; SI-NEXT:    ; implicit-def: $sgpr74
 ; SI-NEXT:    ; implicit-def: $sgpr42
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $sgpr41
-; SI-NEXT:    ; implicit-def: $vgpr7
+; SI-NEXT:    ; implicit-def: $sgpr73
 ; SI-NEXT:    ; implicit-def: $sgpr40
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $sgpr15
-; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $sgpr72
 ; SI-NEXT:    ; implicit-def: $sgpr14
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $sgpr13
-; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $sgpr63
 ; SI-NEXT:    ; implicit-def: $sgpr12
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $sgpr11
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $sgpr62
 ; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr61
+; SI-NEXT:    ; implicit-def: $sgpr8
+; SI-NEXT:    ; implicit-def: $sgpr60
 ; SI-NEXT:    s_branch .LBB41_2
 ;
 ; VI-LABEL: bitcast_v9i64_to_v36i16_scalar:
@@ -21460,97 +21456,97 @@ define inreg <36 x i16> @bitcast_v9f64_to_v36i16_scalar(<9 x double> inreg %a, i
 ; SI-NEXT:    v_mov_b32_e32 v7, s29
 ; SI-NEXT:    s_cbranch_scc0 .LBB49_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_alignbit_b32 v5, v4, v3, 16
-; SI-NEXT:    v_alignbit_b32 v20, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v21, v7, v6, 16
-; SI-NEXT:    v_alignbit_b32 v22, v9, v8, 16
-; SI-NEXT:    v_alignbit_b32 v24, v11, v10, 16
-; SI-NEXT:    v_alignbit_b32 v26, v13, v12, 16
-; SI-NEXT:    v_alignbit_b32 v29, v15, v14, 16
-; SI-NEXT:    v_alignbit_b32 v31, v17, v16, 16
-; SI-NEXT:    v_alignbit_b32 v34, v19, v18, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v7
-; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v11
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v13
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v19
+; SI-NEXT:    v_lshr_b64 v[20:21], v[3:4], 16
+; SI-NEXT:    v_lshr_b64 v[21:22], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[22:23], v[6:7], 16
+; SI-NEXT:    v_lshr_b64 v[23:24], v[8:9], 16
+; SI-NEXT:    v_lshr_b64 v[24:25], v[10:11], 16
+; SI-NEXT:    v_lshr_b64 v[25:26], v[12:13], 16
+; SI-NEXT:    v_lshr_b64 v[26:27], v[14:15], 16
+; SI-NEXT:    v_lshr_b64 v[27:28], v[16:17], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v9
+; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v11
+; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v13
+; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v15
+; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v17
+; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v19
+; SI-NEXT:    v_lshr_b64 v[28:29], v[18:19], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB49_3
 ; SI-NEXT:  .LBB49_2: ; %cmp.true
-; SI-NEXT:    v_add_f64 v[18:19], v[18:19], 1.0
-; SI-NEXT:    v_add_f64 v[16:17], v[16:17], 1.0
-; SI-NEXT:    v_add_f64 v[14:15], v[14:15], 1.0
-; SI-NEXT:    v_add_f64 v[12:13], v[12:13], 1.0
-; SI-NEXT:    v_add_f64 v[10:11], v[10:11], 1.0
-; SI-NEXT:    v_add_f64 v[8:9], v[8:9], 1.0
-; SI-NEXT:    v_add_f64 v[6:7], v[6:7], 1.0
 ; SI-NEXT:    v_add_f64 v[3:4], v[3:4], 1.0
 ; SI-NEXT:    v_add_f64 v[1:2], v[1:2], 1.0
-; SI-NEXT:    v_alignbit_b32 v5, v4, v3, 16
-; SI-NEXT:    v_alignbit_b32 v20, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v21, v7, v6, 16
-; SI-NEXT:    v_alignbit_b32 v22, v9, v8, 16
-; SI-NEXT:    v_alignbit_b32 v24, v11, v10, 16
-; SI-NEXT:    v_alignbit_b32 v26, v13, v12, 16
-; SI-NEXT:    v_alignbit_b32 v29, v15, v14, 16
-; SI-NEXT:    v_alignbit_b32 v31, v17, v16, 16
-; SI-NEXT:    v_alignbit_b32 v34, v19, v18, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v23, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v7
-; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v11
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v13
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v19
+; SI-NEXT:    v_add_f64 v[6:7], v[6:7], 1.0
+; SI-NEXT:    v_lshr_b64 v[20:21], v[3:4], 16
+; SI-NEXT:    v_add_f64 v[8:9], v[8:9], 1.0
+; SI-NEXT:    v_lshr_b64 v[21:22], v[1:2], 16
+; SI-NEXT:    v_add_f64 v[10:11], v[10:11], 1.0
+; SI-NEXT:    v_lshr_b64 v[22:23], v[6:7], 16
+; SI-NEXT:    v_add_f64 v[12:13], v[12:13], 1.0
+; SI-NEXT:    v_lshr_b64 v[23:24], v[8:9], 16
+; SI-NEXT:    v_add_f64 v[14:15], v[14:15], 1.0
+; SI-NEXT:    v_lshr_b64 v[24:25], v[10:11], 16
+; SI-NEXT:    v_add_f64 v[16:17], v[16:17], 1.0
+; SI-NEXT:    v_lshr_b64 v[25:26], v[12:13], 16
+; SI-NEXT:    v_add_f64 v[18:19], v[18:19], 1.0
+; SI-NEXT:    v_lshr_b64 v[26:27], v[14:15], 16
+; SI-NEXT:    v_lshr_b64 v[27:28], v[16:17], 16
+; SI-NEXT:    v_lshr_b64 v[28:29], v[18:19], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v9
+; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v11
+; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v13
+; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v15
+; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v17
+; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v19
 ; SI-NEXT:  .LBB49_3: ; %end
-; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v34
+; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
 ; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v18
-; SI-NEXT:    v_or_b32_e32 v18, v18, v34
+; SI-NEXT:    v_or_b32_e32 v18, v18, v28
 ; SI-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v19
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v36
+; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v37
 ; SI-NEXT:    v_or_b32_e32 v18, v18, v19
 ; SI-NEXT:    v_add_i32_e32 v19, vcc, 4, v0
 ; SI-NEXT:    buffer_store_dword v18, v19, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v16
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v31
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v27
+; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v16
 ; SI-NEXT:    v_or_b32_e32 v16, v16, v18
 ; SI-NEXT:    v_add_i32_e32 v18, vcc, 8, v0
 ; SI-NEXT:    buffer_store_dword v16, v18, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v17
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v35
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v36
 ; SI-NEXT:    v_or_b32_e32 v16, v16, v17
 ; SI-NEXT:    v_add_i32_e32 v17, vcc, 12, v0
 ; SI-NEXT:    buffer_store_dword v16, v17, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v14
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v29
+; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v26
+; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v14
 ; SI-NEXT:    v_or_b32_e32 v14, v14, v16
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 16, v0
 ; SI-NEXT:    buffer_store_dword v14, v16, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v15
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v33
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v35
 ; SI-NEXT:    v_or_b32_e32 v14, v14, v15
 ; SI-NEXT:    v_add_i32_e32 v15, vcc, 20, v0
 ; SI-NEXT:    buffer_store_dword v14, v15, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v12, 0xffff, v12
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v26
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v25
 ; SI-NEXT:    v_or_b32_e32 v12, v12, v14
 ; SI-NEXT:    v_add_i32_e32 v14, vcc, 24, v0
 ; SI-NEXT:    buffer_store_dword v12, v14, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v12, 0xffff, v13
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v32
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v34
 ; SI-NEXT:    v_or_b32_e32 v12, v12, v13
 ; SI-NEXT:    v_add_i32_e32 v13, vcc, 28, v0
 ; SI-NEXT:    buffer_store_dword v12, v13, s[0:3], 0 offen
@@ -21562,79 +21558,79 @@ define inreg <36 x i16> @bitcast_v9f64_to_v36i16_scalar(<9 x double> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v10, v12, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v30
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v33
 ; SI-NEXT:    v_or_b32_e32 v10, v10, v11
 ; SI-NEXT:    v_add_i32_e32 v11, vcc, 36, v0
 ; SI-NEXT:    buffer_store_dword v10, v11, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v22
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v23
 ; SI-NEXT:    v_or_b32_e32 v8, v8, v10
 ; SI-NEXT:    v_add_i32_e32 v10, vcc, 40, v0
 ; SI-NEXT:    buffer_store_dword v8, v10, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v28
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v32
 ; SI-NEXT:    v_or_b32_e32 v8, v8, v9
 ; SI-NEXT:    v_add_i32_e32 v9, vcc, 44, v0
 ; SI-NEXT:    buffer_store_dword v8, v9, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v21
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v22
 ; SI-NEXT:    v_or_b32_e32 v6, v6, v8
 ; SI-NEXT:    v_add_i32_e32 v8, vcc, 48, v0
 ; SI-NEXT:    buffer_store_dword v6, v8, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v27
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v31
 ; SI-NEXT:    v_or_b32_e32 v6, v6, v7
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 52, v0
 ; SI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v20
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v21
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v6
 ; SI-NEXT:    v_add_i32_e32 v6, vcc, 56, v0
 ; SI-NEXT:    buffer_store_dword v1, v6, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v25
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v30
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 60, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v20
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 64, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v23
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x44, v0
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB49_4:
-; SI-NEXT:    ; implicit-def: $vgpr34
+; SI-NEXT:    ; implicit-def: $vgpr28
+; SI-NEXT:    ; implicit-def: $vgpr37
+; SI-NEXT:    ; implicit-def: $vgpr27
 ; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr26
 ; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr29
+; SI-NEXT:    ; implicit-def: $vgpr25
+; SI-NEXT:    ; implicit-def: $vgpr34
+; SI-NEXT:    ; implicit-def: $vgpr24
 ; SI-NEXT:    ; implicit-def: $vgpr33
-; SI-NEXT:    ; implicit-def: $vgpr26
+; SI-NEXT:    ; implicit-def: $vgpr23
 ; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr24
+; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    ; implicit-def: $vgpr30
+; SI-NEXT:    ; implicit-def: $vgpr5
 ; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr28
 ; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr27
 ; SI-NEXT:    ; implicit-def: $vgpr20
-; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr5
-; SI-NEXT:    ; implicit-def: $vgpr23
 ; SI-NEXT:    s_branch .LBB49_2
 ;
 ; VI-LABEL: bitcast_v9f64_to_v36i16_scalar:
@@ -28520,302 +28516,321 @@ define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i
 ; SI-LABEL: bitcast_v36f16_to_v36i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v23
-; SI-NEXT:    v_cvt_f16_f32_e32 v27, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v28, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v29, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v32, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v24, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v25, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v26, v7
-; SI-NEXT:    v_cvt_f16_f32_e32 v31, v8
-; SI-NEXT:    v_cvt_f16_f32_e32 v23, v9
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; SI-NEXT:    v_cvt_f16_f32_e32 v30, v12
+; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f16_f32_e32 v52, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v54, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v50, v6
+; SI-NEXT:    v_cvt_f16_f32_e32 v53, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v49, v10
+; SI-NEXT:    v_cvt_f16_f32_e32 v51, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v39, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v48, v15
+; SI-NEXT:    v_cvt_f16_f32_e32 v37, v18
+; SI-NEXT:    v_cvt_f16_f32_e32 v38, v19
+; SI-NEXT:    v_cvt_f16_f32_e32 v36, v22
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v45, s16
+; SI-NEXT:    v_cvt_f16_f32_e32 v44, s19
+; SI-NEXT:    v_cvt_f16_f32_e32 v43, s20
+; SI-NEXT:    v_cvt_f16_f32_e32 v41, s23
+; SI-NEXT:    v_cvt_f16_f32_e32 v42, s24
+; SI-NEXT:    v_cvt_f16_f32_e32 v55, s27
+; SI-NEXT:    v_cvt_f16_f32_e32 v40, s28
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v26, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v24, v8
+; SI-NEXT:    v_cvt_f16_f32_e32 v8, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v22, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v13
-; SI-NEXT:    v_cvt_f16_f32_e32 v7, v14
-; SI-NEXT:    v_cvt_f16_f32_e32 v8, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v12, v16
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v17
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v18
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v19
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v20
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v21
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v22
-; SI-NEXT:    v_cvt_f16_f32_e32 v22, s16
-; SI-NEXT:    v_cvt_f16_f32_e32 v36, s17
-; SI-NEXT:    v_cvt_f16_f32_e32 v19, s18
-; SI-NEXT:    v_cvt_f16_f32_e32 v20, s19
-; SI-NEXT:    v_cvt_f16_f32_e32 v21, s20
-; SI-NEXT:    v_cvt_f16_f32_e32 v35, s21
-; SI-NEXT:    v_cvt_f16_f32_e32 v16, s22
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, s23
-; SI-NEXT:    v_cvt_f16_f32_e32 v18, s24
-; SI-NEXT:    v_cvt_f16_f32_e32 v34, s25
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, s26
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, s27
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, s28
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, s29
+; SI-NEXT:    v_cvt_f16_f32_e32 v19, v16
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v17
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, v20
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v21
+; SI-NEXT:    v_cvt_f16_f32_e32 v34, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v21, s18
+; SI-NEXT:    v_cvt_f16_f32_e32 v32, s21
+; SI-NEXT:    v_cvt_f16_f32_e32 v18, s22
+; SI-NEXT:    v_cvt_f16_f32_e32 v30, s25
+; SI-NEXT:    v_cvt_f16_f32_e32 v16, s26
+; SI-NEXT:    v_cvt_f16_f32_e32 v28, s29
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v23
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_cbranch_scc0 .LBB59_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_cbranch_execnz .LBB59_3
 ; SI-NEXT:  .LBB59_2: ; %cmp.true
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v44
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v21
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v34
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v23, v23
-; SI-NEXT:    v_cvt_f32_f16_e32 v25, v25
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v44, v5
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v24, v24
-; SI-NEXT:    v_cvt_f32_f16_e32 v28, v28
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v44
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT:    v_or_b32_e32 v21, v3, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v41
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v18
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v32
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v41, v5
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
-; SI-NEXT:    v_cvt_f32_f16_e32 v27, v27
-; SI-NEXT:    v_cvt_f32_f16_e32 v14, v14
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
-; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; SI-NEXT:    v_add_f32_e32 v23, 0x38000000, v23
-; SI-NEXT:    v_add_f32_e32 v25, 0x38000000, v25
-; SI-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; SI-NEXT:    v_cvt_f32_f16_e32 v20, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v23, v23
-; SI-NEXT:    v_cvt_f16_f32_e32 v25, v25
-; SI-NEXT:    v_add_f32_e32 v24, 0x38000000, v24
-; SI-NEXT:    v_add_f32_e32 v28, 0x38000000, v28
-; SI-NEXT:    v_cvt_f32_f16_e32 v19, v19
-; SI-NEXT:    v_cvt_f32_f16_e32 v36, v36
-; SI-NEXT:    v_cvt_f32_f16_e32 v35, v35
-; SI-NEXT:    v_cvt_f32_f16_e32 v34, v34
-; SI-NEXT:    v_cvt_f32_f16_e32 v33, v33
-; SI-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v30, v30
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT:    v_or_b32_e32 v1, v1, v37
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v24, v24
-; SI-NEXT:    v_cvt_f16_f32_e32 v28, v28
-; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v27
-; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
-; SI-NEXT:    v_cvt_f32_f16_e32 v22, v22
-; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT:    v_cvt_f32_f16_e32 v18, v18
-; SI-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; SI-NEXT:    v_cvt_f32_f16_e32 v29, v29
-; SI-NEXT:    v_cvt_f32_f16_e32 v26, v26
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT:    v_or_b32_e32 v3, v3, v37
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v7
-; SI-NEXT:    v_cvt_f16_f32_e32 v27, v27
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
-; SI-NEXT:    v_add_f32_e32 v17, 0x38000000, v17
-; SI-NEXT:    v_or_b32_e32 v6, v6, v37
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v10
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
-; SI-NEXT:    v_add_f32_e32 v16, 0x38000000, v16
-; SI-NEXT:    v_add_f32_e32 v20, 0x38000000, v20
-; SI-NEXT:    v_or_b32_e32 v23, v23, v37
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v25
-; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
-; SI-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
-; SI-NEXT:    v_add_f32_e32 v36, 0x38000000, v36
-; SI-NEXT:    v_add_f32_e32 v35, 0x38000000, v35
-; SI-NEXT:    v_add_f32_e32 v34, 0x38000000, v34
-; SI-NEXT:    v_add_f32_e32 v33, 0x38000000, v33
-; SI-NEXT:    v_add_f32_e32 v32, 0x38000000, v32
-; SI-NEXT:    v_add_f32_e32 v31, 0x38000000, v31
-; SI-NEXT:    v_add_f32_e32 v30, 0x38000000, v30
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v41
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_or_b32_e32 v18, v3, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v55
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v16
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v30
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v55, v5
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v55
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_or_b32_e32 v16, v3, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v52
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v28
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v52, v5
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v52
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_or_b32_e32 v14, v3, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v50
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v10
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v26
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v50, v5
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v50
+; SI-NEXT:    v_or_b32_e32 v10, v3, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v49
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v8
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v24
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v49, v5
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v49
+; SI-NEXT:    v_or_b32_e32 v8, v3, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v39
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v22
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v39, v5
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v39
+; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
+; SI-NEXT:    v_or_b32_e32 v6, v3, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v37
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v19
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v37, v4
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v37
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_or_b32_e32 v4, v3, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v11
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, v36
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v43
+; SI-NEXT:    v_cvt_f32_f16_e32 v19, v42
+; SI-NEXT:    v_cvt_f32_f16_e32 v22, v38
+; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v36, v11
 ; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
-; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
-; SI-NEXT:    v_or_b32_e32 v24, v24, v37
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v28
+; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v36
+; SI-NEXT:    v_or_b32_e32 v2, v2, v11
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, v45
+; SI-NEXT:    v_or_b32_e32 v43, v12, v17
+; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v19
+; SI-NEXT:    v_cvt_f32_f16_e32 v19, v54
+; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; SI-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
-; SI-NEXT:    v_cvt_f16_f32_e32 v36, v36
-; SI-NEXT:    v_add_f32_e32 v22, 0x38000000, v22
-; SI-NEXT:    v_cvt_f16_f32_e32 v35, v35
-; SI-NEXT:    v_add_f32_e32 v21, 0x38000000, v21
-; SI-NEXT:    v_cvt_f16_f32_e32 v34, v34
-; SI-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, v33
-; SI-NEXT:    v_add_f32_e32 v15, 0x38000000, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v32, v32
-; SI-NEXT:    v_add_f32_e32 v29, 0x38000000, v29
-; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
-; SI-NEXT:    v_add_f32_e32 v26, 0x38000000, v26
-; SI-NEXT:    v_cvt_f16_f32_e32 v30, v30
+; SI-NEXT:    v_or_b32_e32 v45, v11, v20
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, v40
+; SI-NEXT:    v_or_b32_e32 v42, v12, v15
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v53
+; SI-NEXT:    v_or_b32_e32 v54, v19, v9
 ; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; SI-NEXT:    v_cvt_f32_f16_e32 v19, v48
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
+; SI-NEXT:    v_or_b32_e32 v40, v11, v13
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, v51
+; SI-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
+; SI-NEXT:    v_add_f32_e32 v22, 0x38000000, v22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
-; SI-NEXT:    v_or_b32_e32 v27, v27, v37
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v14
-; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
-; SI-NEXT:    v_cvt_f16_f32_e32 v21, v21
-; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v29, v29
-; SI-NEXT:    v_cvt_f16_f32_e32 v26, v26
+; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; SI-NEXT:    v_or_b32_e32 v13, v13, v37
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v17
-; SI-NEXT:    v_or_b32_e32 v16, v16, v37
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v20
-; SI-NEXT:    v_or_b32_e32 v19, v19, v37
-; SI-NEXT:    v_lshlrev_b32_e32 v36, 16, v36
-; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v35
-; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v34
-; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v33
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v32
-; SI-NEXT:    v_lshlrev_b32_e32 v31, 16, v31
-; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT:    v_or_b32_e32 v22, v22, v36
-; SI-NEXT:    v_or_b32_e32 v21, v21, v35
-; SI-NEXT:    v_or_b32_e32 v18, v18, v34
-; SI-NEXT:    v_or_b32_e32 v15, v15, v33
-; SI-NEXT:    v_or_b32_e32 v29, v29, v32
-; SI-NEXT:    v_or_b32_e32 v26, v26, v31
-; SI-NEXT:    v_or_b32_e32 v11, v11, v30
-; SI-NEXT:    v_or_b32_e32 v8, v8, v12
-; SI-NEXT:    v_or_b32_e32 v5, v5, v9
-; SI-NEXT:    v_alignbit_b32 v36, v19, v36, 16
-; SI-NEXT:    v_alignbit_b32 v35, v16, v35, 16
-; SI-NEXT:    v_alignbit_b32 v34, v13, v34, 16
-; SI-NEXT:    v_alignbit_b32 v33, v27, v33, 16
-; SI-NEXT:    v_alignbit_b32 v32, v24, v32, 16
-; SI-NEXT:    v_alignbit_b32 v31, v23, v31, 16
-; SI-NEXT:    v_alignbit_b32 v30, v6, v30, 16
-; SI-NEXT:    v_alignbit_b32 v12, v3, v12, 16
-; SI-NEXT:    v_alignbit_b32 v9, v1, v9, 16
+; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
+; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_or_b32_e32 v53, v12, v7
+; SI-NEXT:    v_or_b32_e32 v51, v11, v5
+; SI-NEXT:    v_or_b32_e32 v48, v19, v3
+; SI-NEXT:    v_or_b32_e32 v38, v22, v1
+; SI-NEXT:    v_lshr_b64 v[34:35], v[20:21], 16
+; SI-NEXT:    v_lshr_b64 v[32:33], v[17:18], 16
+; SI-NEXT:    v_lshr_b64 v[30:31], v[15:16], 16
+; SI-NEXT:    v_lshr_b64 v[28:29], v[13:14], 16
+; SI-NEXT:    v_lshr_b64 v[26:27], v[9:10], 16
+; SI-NEXT:    v_lshr_b64 v[24:25], v[7:8], 16
+; SI-NEXT:    v_lshr_b64 v[22:23], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[19:20], v[3:4], 16
+; SI-NEXT:    v_lshr_b64 v[11:12], v[1:2], 16
 ; SI-NEXT:  .LBB59_3: ; %end
-; SI-NEXT:    v_and_b32_e32 v22, 0xffff, v22
-; SI-NEXT:    v_lshlrev_b32_e32 v36, 16, v36
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; SI-NEXT:    v_or_b32_e32 v22, v22, v36
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 4, v0
-; SI-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v21
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v35
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 8, v0
-; SI-NEXT:    v_or_b32_e32 v16, v16, v17
-; SI-NEXT:    v_add_i32_e32 v17, vcc, 12, v0
-; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v16, v17, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v34
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v45
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v18
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v34
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT:    v_or_b32_e32 v16, v16, v17
-; SI-NEXT:    v_add_i32_e32 v17, vcc, 16, v0
-; SI-NEXT:    v_or_b32_e32 v13, v13, v14
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 20, v0
-; SI-NEXT:    buffer_store_dword v16, v17, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v21
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v44
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 4, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v15
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v33
-; SI-NEXT:    v_or_b32_e32 v13, v13, v14
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 24, v0
-; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v32
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v43
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 8, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v27
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v28
-; SI-NEXT:    v_or_b32_e32 v13, v13, v14
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 28, v0
-; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v18
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v41
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 12, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v29
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v32
-; SI-NEXT:    v_or_b32_e32 v13, v13, v14
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 32, v0
-; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v30
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v42
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v25
-; SI-NEXT:    v_or_b32_e32 v13, v13, v14
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 36, v0
-; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v16
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v55
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 20, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v26
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v31
-; SI-NEXT:    v_or_b32_e32 v13, v13, v14
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 40, v0
-; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v28
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v40
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 24, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v23
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT:    v_or_b32_e32 v10, v13, v10
-; SI-NEXT:    v_add_i32_e32 v13, vcc, 44, v0
-; SI-NEXT:    buffer_store_dword v10, v13, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v52
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 28, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v30
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    v_or_b32_e32 v10, v10, v11
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 48, v0
-; SI-NEXT:    v_or_b32_e32 v6, v6, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 52, v0
-; SI-NEXT:    buffer_store_dword v10, v11, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v54
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v26
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 32, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v12
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT:    v_or_b32_e32 v6, v6, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 56, v0
-; SI-NEXT:    v_or_b32_e32 v3, v3, v4
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 60, v0
-; SI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v10
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v50
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 36, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v9
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v3, v3, v4
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 64, v0
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v53
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v24
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 40, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v8
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v49
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 44, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v51
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v22
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 48, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v39
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 52, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v48
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v19
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 56, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v37
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 60, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v38
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v11
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 64, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v36
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x44, v0
-; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB59_4:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
index b8091d8256457..47cb6bd3b3bb6 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll
@@ -2838,240 +2838,238 @@ define inreg <40 x i16> @bitcast_v20i32_to_v40i16_scalar(<20 x i32> inreg %a, i3
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; SI-NEXT:    v_readfirstlane_b32 s11, v1
-; SI-NEXT:    v_readfirstlane_b32 s10, v2
-; SI-NEXT:    v_readfirstlane_b32 s9, v3
-; SI-NEXT:    v_readfirstlane_b32 s8, v4
-; SI-NEXT:    v_readfirstlane_b32 s7, v5
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_readfirstlane_b32 s6, v6
+; SI-NEXT:    v_readfirstlane_b32 s8, v1
+; SI-NEXT:    v_readfirstlane_b32 s9, v2
+; SI-NEXT:    v_readfirstlane_b32 s6, v3
+; SI-NEXT:    v_readfirstlane_b32 s7, v4
+; SI-NEXT:    v_readfirstlane_b32 s4, v5
+; SI-NEXT:    s_and_b64 s[10:11], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s5, v6
 ; SI-NEXT:    s_cbranch_scc0 .LBB13_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s9
-; SI-NEXT:    v_mov_b32_e32 v3, s11
-; SI-NEXT:    v_mov_b32_e32 v4, s28
-; SI-NEXT:    v_mov_b32_e32 v5, s26
-; SI-NEXT:    v_mov_b32_e32 v6, s24
-; SI-NEXT:    v_mov_b32_e32 v7, s22
-; SI-NEXT:    v_mov_b32_e32 v8, s20
-; SI-NEXT:    v_mov_b32_e32 v9, s18
-; SI-NEXT:    v_mov_b32_e32 v10, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v2, s8, v2, 16
-; SI-NEXT:    v_alignbit_b32 v3, s10, v3, 16
-; SI-NEXT:    v_alignbit_b32 v4, s29, v4, 16
-; SI-NEXT:    v_alignbit_b32 v5, s27, v5, 16
-; SI-NEXT:    v_alignbit_b32 v6, s25, v6, 16
-; SI-NEXT:    v_alignbit_b32 v7, s23, v7, 16
-; SI-NEXT:    v_alignbit_b32 v8, s21, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, s19, v9, 16
-; SI-NEXT:    v_alignbit_b32 v10, s17, v10, 16
-; SI-NEXT:    s_lshr_b32 s12, s6, 16
-; SI-NEXT:    s_lshr_b32 s13, s8, 16
-; SI-NEXT:    s_lshr_b32 s14, s10, 16
-; SI-NEXT:    s_lshr_b32 s15, s29, 16
-; SI-NEXT:    s_lshr_b32 s40, s27, 16
-; SI-NEXT:    s_lshr_b32 s41, s25, 16
-; SI-NEXT:    s_lshr_b32 s42, s23, 16
-; SI-NEXT:    s_lshr_b32 s43, s21, 16
-; SI-NEXT:    s_lshr_b32 s44, s19, 16
-; SI-NEXT:    s_lshr_b32 s45, s17, 16
+; SI-NEXT:    s_lshr_b32 s72, s5, 16
+; SI-NEXT:    s_lshr_b32 s73, s7, 16
+; SI-NEXT:    s_lshr_b32 s74, s9, 16
+; SI-NEXT:    s_lshr_b32 s75, s29, 16
+; SI-NEXT:    s_lshr_b32 s76, s27, 16
+; SI-NEXT:    s_lshr_b32 s77, s25, 16
+; SI-NEXT:    s_lshr_b32 s78, s23, 16
+; SI-NEXT:    s_lshr_b32 s79, s21, 16
+; SI-NEXT:    s_lshr_b32 s88, s19, 16
+; SI-NEXT:    s_lshr_b32 s89, s17, 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[42:43], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[58:59], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB13_3
 ; SI-NEXT:  .LBB13_2: ; %cmp.true
+; SI-NEXT:    s_add_i32 s17, s17, 3
 ; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    s_add_i32 s19, s19, 3
 ; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_add_i32 s21, s21, 3
 ; SI-NEXT:    s_add_i32 s20, s20, 3
+; SI-NEXT:    s_add_i32 s23, s23, 3
 ; SI-NEXT:    s_add_i32 s22, s22, 3
+; SI-NEXT:    s_add_i32 s25, s25, 3
 ; SI-NEXT:    s_add_i32 s24, s24, 3
+; SI-NEXT:    s_add_i32 s27, s27, 3
 ; SI-NEXT:    s_add_i32 s26, s26, 3
+; SI-NEXT:    s_add_i32 s29, s29, 3
 ; SI-NEXT:    s_add_i32 s28, s28, 3
-; SI-NEXT:    s_add_i32 s11, s11, 3
 ; SI-NEXT:    s_add_i32 s9, s9, 3
-; SI-NEXT:    s_add_i32 s7, s7, 3
-; SI-NEXT:    s_add_i32 s17, s17, 3
-; SI-NEXT:    s_add_i32 s19, s19, 3
-; SI-NEXT:    s_add_i32 s21, s21, 3
-; SI-NEXT:    s_add_i32 s23, s23, 3
-; SI-NEXT:    s_add_i32 s25, s25, 3
-; SI-NEXT:    s_add_i32 s27, s27, 3
-; SI-NEXT:    s_add_i32 s29, s29, 3
-; SI-NEXT:    s_add_i32 s10, s10, 3
 ; SI-NEXT:    s_add_i32 s8, s8, 3
+; SI-NEXT:    s_add_i32 s7, s7, 3
 ; SI-NEXT:    s_add_i32 s6, s6, 3
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s9
-; SI-NEXT:    v_mov_b32_e32 v3, s11
-; SI-NEXT:    v_mov_b32_e32 v4, s28
-; SI-NEXT:    v_mov_b32_e32 v5, s26
-; SI-NEXT:    v_mov_b32_e32 v6, s24
-; SI-NEXT:    v_mov_b32_e32 v7, s22
-; SI-NEXT:    v_mov_b32_e32 v8, s20
-; SI-NEXT:    v_mov_b32_e32 v9, s18
-; SI-NEXT:    v_mov_b32_e32 v10, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v2, s8, v2, 16
-; SI-NEXT:    v_alignbit_b32 v3, s10, v3, 16
-; SI-NEXT:    v_alignbit_b32 v4, s29, v4, 16
-; SI-NEXT:    v_alignbit_b32 v5, s27, v5, 16
-; SI-NEXT:    v_alignbit_b32 v6, s25, v6, 16
-; SI-NEXT:    v_alignbit_b32 v7, s23, v7, 16
-; SI-NEXT:    v_alignbit_b32 v8, s21, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, s19, v9, 16
-; SI-NEXT:    v_alignbit_b32 v10, s17, v10, 16
-; SI-NEXT:    s_lshr_b32 s12, s6, 16
-; SI-NEXT:    s_lshr_b32 s13, s8, 16
-; SI-NEXT:    s_lshr_b32 s14, s10, 16
-; SI-NEXT:    s_lshr_b32 s15, s29, 16
-; SI-NEXT:    s_lshr_b32 s40, s27, 16
-; SI-NEXT:    s_lshr_b32 s41, s25, 16
-; SI-NEXT:    s_lshr_b32 s42, s23, 16
-; SI-NEXT:    s_lshr_b32 s43, s21, 16
-; SI-NEXT:    s_lshr_b32 s44, s19, 16
-; SI-NEXT:    s_lshr_b32 s45, s17, 16
+; SI-NEXT:    s_add_i32 s5, s5, 3
+; SI-NEXT:    s_add_i32 s4, s4, 3
+; SI-NEXT:    s_lshr_b64 s[10:11], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[42:43], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[22:23], 16
+; SI-NEXT:    s_lshr_b32 s72, s5, 16
+; SI-NEXT:    s_lshr_b32 s73, s7, 16
+; SI-NEXT:    s_lshr_b32 s74, s9, 16
+; SI-NEXT:    s_lshr_b32 s75, s29, 16
+; SI-NEXT:    s_lshr_b32 s76, s27, 16
+; SI-NEXT:    s_lshr_b32 s77, s25, 16
+; SI-NEXT:    s_lshr_b32 s78, s23, 16
+; SI-NEXT:    s_lshr_b32 s79, s21, 16
+; SI-NEXT:    s_lshr_b32 s88, s19, 16
+; SI-NEXT:    s_lshr_b32 s89, s17, 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[58:59], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[16:17], 16
 ; SI-NEXT:  .LBB13_3: ; %end
-; SI-NEXT:    s_and_b32 s4, s16, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT:    v_or_b32_e32 v10, s4, v10
-; SI-NEXT:    s_and_b32 s4, s17, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s45, 16
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_mov_b32_e32 v11, s4
-; SI-NEXT:    s_and_b32 s4, s18, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 4, v0
-; SI-NEXT:    v_or_b32_e32 v9, s4, v9
-; SI-NEXT:    s_and_b32 s4, s19, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s44, 16
-; SI-NEXT:    buffer_store_dword v11, v10, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 8, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v9, v10, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v10, s4
-; SI-NEXT:    s_and_b32 s4, s20, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; SI-NEXT:    s_lshl_b32 s11, s60, 16
+; SI-NEXT:    s_and_b32 s13, s16, 0xffff
+; SI-NEXT:    s_or_b32 s11, s13, s11
+; SI-NEXT:    v_mov_b32_e32 v1, s11
+; SI-NEXT:    s_and_b32 s11, s17, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s89, 16
+; SI-NEXT:    s_or_b32 s11, s11, s13
+; SI-NEXT:    v_mov_b32_e32 v2, s11
+; SI-NEXT:    s_lshl_b32 s11, s58, 16
+; SI-NEXT:    s_and_b32 s13, s18, 0xffff
+; SI-NEXT:    s_or_b32 s11, s13, s11
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 12, v0
-; SI-NEXT:    v_or_b32_e32 v8, s4, v8
-; SI-NEXT:    s_and_b32 s4, s21, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s43, 16
-; SI-NEXT:    buffer_store_dword v10, v9, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 16, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v8, v9, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v9, s4
-; SI-NEXT:    s_and_b32 s4, s22, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 20, v0
-; SI-NEXT:    v_or_b32_e32 v7, s4, v7
-; SI-NEXT:    s_and_b32 s4, s23, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s42, 16
-; SI-NEXT:    buffer_store_dword v9, v8, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 24, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v8, s4
-; SI-NEXT:    s_and_b32 s4, s24, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT:    v_mov_b32_e32 v2, s11
+; SI-NEXT:    s_and_b32 s11, s19, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s88, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
+; SI-NEXT:    s_or_b32 s11, s11, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 28, v0
-; SI-NEXT:    v_or_b32_e32 v6, s4, v6
-; SI-NEXT:    s_and_b32 s4, s25, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s41, 16
-; SI-NEXT:    buffer_store_dword v8, v7, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 32, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v7, s4
-; SI-NEXT:    s_and_b32 s4, s26, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_mov_b32_e32 v2, s11
+; SI-NEXT:    s_and_b32 s11, s20, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s56, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 12, v0
+; SI-NEXT:    s_or_b32 s11, s11, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 36, v0
-; SI-NEXT:    v_or_b32_e32 v5, s4, v5
-; SI-NEXT:    s_and_b32 s4, s27, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s40, 16
-; SI-NEXT:    buffer_store_dword v7, v6, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 40, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v6, s4
-; SI-NEXT:    s_and_b32 s4, s28, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_mov_b32_e32 v2, s11
+; SI-NEXT:    s_and_b32 s11, s21, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s79, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 16, v0
+; SI-NEXT:    s_or_b32 s11, s11, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 44, v0
-; SI-NEXT:    v_or_b32_e32 v4, s4, v4
-; SI-NEXT:    s_and_b32 s4, s29, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s15, 16
-; SI-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v5, s4
-; SI-NEXT:    s_and_b32 s4, s11, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_mov_b32_e32 v2, s11
+; SI-NEXT:    s_and_b32 s11, s22, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s46, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 20, v0
+; SI-NEXT:    s_or_b32 s11, s11, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 52, v0
-; SI-NEXT:    v_or_b32_e32 v3, s4, v3
-; SI-NEXT:    s_and_b32 s4, s10, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s14, 16
-; SI-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 56, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v4, s4
-; SI-NEXT:    s_and_b32 s4, s9, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_mov_b32_e32 v2, s11
+; SI-NEXT:    s_and_b32 s11, s23, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s78, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 24, v0
+; SI-NEXT:    s_or_b32 s11, s11, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 60, v0
-; SI-NEXT:    v_or_b32_e32 v2, s4, v2
-; SI-NEXT:    s_and_b32 s4, s8, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s13, 16
-; SI-NEXT:    buffer_store_dword v4, v3, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 64, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v3, s4
-; SI-NEXT:    s_and_b32 s4, s7, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_mov_b32_e32 v2, s11
+; SI-NEXT:    s_and_b32 s11, s24, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s44, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 28, v0
+; SI-NEXT:    s_or_b32 s11, s11, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x44, v0
-; SI-NEXT:    v_or_b32_e32 v1, s4, v1
-; SI-NEXT:    s_and_b32 s4, s6, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s12, 16
-; SI-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x48, v0
+; SI-NEXT:    v_mov_b32_e32 v2, s11
+; SI-NEXT:    s_and_b32 s11, s25, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s77, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 32, v0
+; SI-NEXT:    s_or_b32 s11, s11, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s11
+; SI-NEXT:    s_and_b32 s11, s26, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s42, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 36, v0
+; SI-NEXT:    s_or_b32 s11, s11, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s11
+; SI-NEXT:    s_and_b32 s11, s27, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s76, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 40, v0
+; SI-NEXT:    s_or_b32 s11, s11, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s11
+; SI-NEXT:    s_and_b32 s11, s28, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s40, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 44, v0
+; SI-NEXT:    s_or_b32 s11, s11, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s11
+; SI-NEXT:    s_and_b32 s11, s29, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s75, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 48, v0
+; SI-NEXT:    s_or_b32 s11, s11, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s11
+; SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s14, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 52, v0
+; SI-NEXT:    s_or_b32 s8, s8, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s8, s9, 0xffff
+; SI-NEXT:    s_lshl_b32 s9, s74, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 56, v0
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_lshl_b32 s8, s12, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 60, v0
+; SI-NEXT:    s_or_b32 s6, s6, s8
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s6, s7, 0xffff
+; SI-NEXT:    s_lshl_b32 s7, s73, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 64, v0
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; SI-NEXT:    s_lshl_b32 s6, s10, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x44, v0
+; SI-NEXT:    s_or_b32 s4, s4, s6
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s4
+; SI-NEXT:    s_and_b32 s4, s5, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s72, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x48, v0
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x4c, v0
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB13_4:
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $sgpr45
-; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $sgpr60
+; SI-NEXT:    ; implicit-def: $sgpr89
+; SI-NEXT:    ; implicit-def: $sgpr58
+; SI-NEXT:    ; implicit-def: $sgpr88
+; SI-NEXT:    ; implicit-def: $sgpr56
+; SI-NEXT:    ; implicit-def: $sgpr79
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr78
 ; SI-NEXT:    ; implicit-def: $sgpr44
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $sgpr43
-; SI-NEXT:    ; implicit-def: $vgpr7
+; SI-NEXT:    ; implicit-def: $sgpr77
 ; SI-NEXT:    ; implicit-def: $sgpr42
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $sgpr41
-; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $sgpr76
 ; SI-NEXT:    ; implicit-def: $sgpr40
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $sgpr15
-; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $sgpr75
 ; SI-NEXT:    ; implicit-def: $sgpr14
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $sgpr13
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $sgpr74
 ; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr73
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr72
 ; SI-NEXT:    s_branch .LBB13_2
 ;
 ; VI-LABEL: bitcast_v20i32_to_v40i16_scalar:
@@ -10571,165 +10569,165 @@ define inreg <40 x i16> @bitcast_v20f32_to_v40i16_scalar(<20 x float> inreg %a,
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; SI-NEXT:    v_mov_b32_e32 v21, s16
+; SI-NEXT:    v_mov_b32_e32 v19, s16
 ; SI-NEXT:    v_mov_b32_e32 v20, s17
-; SI-NEXT:    v_mov_b32_e32 v19, s18
-; SI-NEXT:    v_mov_b32_e32 v17, s19
-; SI-NEXT:    v_mov_b32_e32 v16, s20
-; SI-NEXT:    v_mov_b32_e32 v15, s21
-; SI-NEXT:    v_mov_b32_e32 v14, s22
-; SI-NEXT:    v_mov_b32_e32 v13, s23
-; SI-NEXT:    v_mov_b32_e32 v12, s24
-; SI-NEXT:    v_mov_b32_e32 v11, s25
-; SI-NEXT:    v_mov_b32_e32 v10, s26
-; SI-NEXT:    v_mov_b32_e32 v9, s27
-; SI-NEXT:    v_mov_b32_e32 v8, s28
+; SI-NEXT:    v_mov_b32_e32 v17, s18
+; SI-NEXT:    v_mov_b32_e32 v18, s19
+; SI-NEXT:    v_mov_b32_e32 v15, s20
+; SI-NEXT:    v_mov_b32_e32 v16, s21
+; SI-NEXT:    v_mov_b32_e32 v13, s22
+; SI-NEXT:    v_mov_b32_e32 v14, s23
+; SI-NEXT:    v_mov_b32_e32 v11, s24
+; SI-NEXT:    v_mov_b32_e32 v12, s25
+; SI-NEXT:    v_mov_b32_e32 v9, s26
+; SI-NEXT:    v_mov_b32_e32 v10, s27
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_mov_b32_e32 v7, s29
+; SI-NEXT:    v_mov_b32_e32 v7, s28
+; SI-NEXT:    v_mov_b32_e32 v8, s29
 ; SI-NEXT:    s_cbranch_scc0 .LBB29_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_alignbit_b32 v18, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v22, v4, v3, 16
-; SI-NEXT:    v_alignbit_b32 v23, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v24, v7, v8, 16
-; SI-NEXT:    v_alignbit_b32 v26, v9, v10, 16
-; SI-NEXT:    v_alignbit_b32 v28, v11, v12, 16
-; SI-NEXT:    v_alignbit_b32 v31, v13, v14, 16
-; SI-NEXT:    v_alignbit_b32 v33, v15, v16, 16
-; SI-NEXT:    v_alignbit_b32 v35, v17, v19, 16
-; SI-NEXT:    v_alignbit_b32 v37, v20, v21, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v7
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v11
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v13
-; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v15
-; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v20
+; SI-NEXT:    v_lshr_b64 v[21:22], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[27:28], v[9:10], 16
+; SI-NEXT:    v_lshr_b64 v[22:23], v[3:4], 16
+; SI-NEXT:    v_lshr_b64 v[28:29], v[13:14], 16
+; SI-NEXT:    v_lshr_b64 v[23:24], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[29:30], v[15:16], 16
+; SI-NEXT:    v_lshr_b64 v[24:25], v[7:8], 16
+; SI-NEXT:    v_lshr_b64 v[30:31], v[17:18], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v20
+; SI-NEXT:    v_lshr_b64 v[25:26], v[11:12], 16
+; SI-NEXT:    v_lshr_b64 v[31:32], v[19:20], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB29_3
 ; SI-NEXT:  .LBB29_2: ; %cmp.true
-; SI-NEXT:    v_add_f32_e32 v20, 1.0, v20
-; SI-NEXT:    v_add_f32_e32 v21, 1.0, v21
-; SI-NEXT:    v_add_f32_e32 v17, 1.0, v17
-; SI-NEXT:    v_add_f32_e32 v19, 1.0, v19
-; SI-NEXT:    v_add_f32_e32 v15, 1.0, v15
-; SI-NEXT:    v_add_f32_e32 v16, 1.0, v16
-; SI-NEXT:    v_add_f32_e32 v13, 1.0, v13
-; SI-NEXT:    v_add_f32_e32 v14, 1.0, v14
-; SI-NEXT:    v_add_f32_e32 v11, 1.0, v11
-; SI-NEXT:    v_add_f32_e32 v12, 1.0, v12
-; SI-NEXT:    v_add_f32_e32 v9, 1.0, v9
 ; SI-NEXT:    v_add_f32_e32 v10, 1.0, v10
-; SI-NEXT:    v_add_f32_e32 v7, 1.0, v7
-; SI-NEXT:    v_add_f32_e32 v8, 1.0, v8
-; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_add_f32_e32 v1, 1.0, v1
-; SI-NEXT:    v_add_f32_e32 v4, 1.0, v4
-; SI-NEXT:    v_add_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_add_f32_e32 v9, 1.0, v9
 ; SI-NEXT:    v_add_f32_e32 v6, 1.0, v6
 ; SI-NEXT:    v_add_f32_e32 v5, 1.0, v5
-; SI-NEXT:    v_alignbit_b32 v18, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v22, v4, v3, 16
-; SI-NEXT:    v_alignbit_b32 v23, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v24, v7, v8, 16
-; SI-NEXT:    v_alignbit_b32 v26, v9, v10, 16
-; SI-NEXT:    v_alignbit_b32 v28, v11, v12, 16
-; SI-NEXT:    v_alignbit_b32 v31, v13, v14, 16
-; SI-NEXT:    v_alignbit_b32 v33, v15, v16, 16
-; SI-NEXT:    v_alignbit_b32 v35, v17, v19, 16
-; SI-NEXT:    v_alignbit_b32 v37, v20, v21, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v7
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v11
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v13
-; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v15
-; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v20
+; SI-NEXT:    v_add_f32_e32 v14, 1.0, v14
+; SI-NEXT:    v_add_f32_e32 v13, 1.0, v13
+; SI-NEXT:    v_add_f32_e32 v4, 1.0, v4
+; SI-NEXT:    v_add_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_lshr_b64 v[21:22], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[27:28], v[9:10], 16
+; SI-NEXT:    v_add_f32_e32 v16, 1.0, v16
+; SI-NEXT:    v_add_f32_e32 v15, 1.0, v15
+; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_lshr_b64 v[22:23], v[3:4], 16
+; SI-NEXT:    v_lshr_b64 v[28:29], v[13:14], 16
+; SI-NEXT:    v_add_f32_e32 v18, 1.0, v18
+; SI-NEXT:    v_add_f32_e32 v17, 1.0, v17
+; SI-NEXT:    v_add_f32_e32 v8, 1.0, v8
+; SI-NEXT:    v_add_f32_e32 v7, 1.0, v7
+; SI-NEXT:    v_lshr_b64 v[23:24], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[29:30], v[15:16], 16
+; SI-NEXT:    v_add_f32_e32 v20, 1.0, v20
+; SI-NEXT:    v_add_f32_e32 v19, 1.0, v19
+; SI-NEXT:    v_add_f32_e32 v12, 1.0, v12
+; SI-NEXT:    v_add_f32_e32 v11, 1.0, v11
+; SI-NEXT:    v_lshr_b64 v[24:25], v[7:8], 16
+; SI-NEXT:    v_lshr_b64 v[30:31], v[17:18], 16
+; SI-NEXT:    v_lshr_b64 v[25:26], v[11:12], 16
+; SI-NEXT:    v_lshr_b64 v[31:32], v[19:20], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v20
 ; SI-NEXT:  .LBB29_3: ; %end
-; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v21
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v37
-; SI-NEXT:    v_or_b32_e32 v21, v21, v37
-; SI-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v20
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v48
-; SI-NEXT:    v_or_b32_e32 v20, v20, v21
-; SI-NEXT:    v_add_i32_e32 v21, vcc, 4, v0
-; SI-NEXT:    buffer_store_dword v20, v21, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v31
 ; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
+; SI-NEXT:    v_or_b32_e32 v19, v19, v26
+; SI-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v35
+; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v20
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v50
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 8, v0
+; SI-NEXT:    v_add_i32_e32 v20, vcc, 4, v0
 ; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v39
+; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v30
+; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
 ; SI-NEXT:    v_or_b32_e32 v17, v17, v19
-; SI-NEXT:    v_add_i32_e32 v19, vcc, 12, v0
+; SI-NEXT:    v_add_i32_e32 v19, vcc, 8, v0
 ; SI-NEXT:    buffer_store_dword v17, v19, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v16
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v33
-; SI-NEXT:    v_or_b32_e32 v16, v16, v17
-; SI-NEXT:    v_add_i32_e32 v17, vcc, 16, v0
-; SI-NEXT:    buffer_store_dword v16, v17, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v18
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v49
+; SI-NEXT:    v_or_b32_e32 v17, v17, v18
+; SI-NEXT:    v_add_i32_e32 v18, vcc, 12, v0
+; SI-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v38
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v29
+; SI-NEXT:    v_or_b32_e32 v15, v15, v17
+; SI-NEXT:    v_add_i32_e32 v17, vcc, 16, v0
+; SI-NEXT:    buffer_store_dword v15, v17, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v16
+; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v48
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v16
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 20, v0
 ; SI-NEXT:    buffer_store_dword v15, v16, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v13
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v31
-; SI-NEXT:    v_or_b32_e32 v14, v14, v15
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v28
+; SI-NEXT:    v_or_b32_e32 v13, v13, v15
 ; SI-NEXT:    v_add_i32_e32 v15, vcc, 24, v0
-; SI-NEXT:    buffer_store_dword v14, v15, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; SI-NEXT:    buffer_store_dword v13, v15, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v36
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v39
 ; SI-NEXT:    v_or_b32_e32 v13, v13, v14
 ; SI-NEXT:    v_add_i32_e32 v14, vcc, 28, v0
 ; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v28
-; SI-NEXT:    v_or_b32_e32 v12, v12, v13
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v25
+; SI-NEXT:    v_or_b32_e32 v11, v11, v13
 ; SI-NEXT:    v_add_i32_e32 v13, vcc, 32, v0
-; SI-NEXT:    buffer_store_dword v12, v13, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; SI-NEXT:    buffer_store_dword v11, v13, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v34
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v12
+; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v38
 ; SI-NEXT:    v_or_b32_e32 v11, v11, v12
 ; SI-NEXT:    v_add_i32_e32 v12, vcc, 36, v0
 ; SI-NEXT:    buffer_store_dword v11, v12, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v26
-; SI-NEXT:    v_or_b32_e32 v10, v10, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v27
+; SI-NEXT:    v_or_b32_e32 v9, v9, v11
 ; SI-NEXT:    v_add_i32_e32 v11, vcc, 40, v0
-; SI-NEXT:    buffer_store_dword v10, v11, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; SI-NEXT:    buffer_store_dword v9, v11, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v32
+; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v10
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v37
 ; SI-NEXT:    v_or_b32_e32 v9, v9, v10
 ; SI-NEXT:    v_add_i32_e32 v10, vcc, 44, v0
 ; SI-NEXT:    buffer_store_dword v9, v10, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v24
-; SI-NEXT:    v_or_b32_e32 v8, v8, v9
+; SI-NEXT:    v_or_b32_e32 v7, v7, v9
 ; SI-NEXT:    v_add_i32_e32 v9, vcc, 48, v0
-; SI-NEXT:    buffer_store_dword v8, v9, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
+; SI-NEXT:    buffer_store_dword v7, v9, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v30
+; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v8
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v36
 ; SI-NEXT:    v_or_b32_e32 v7, v7, v8
 ; SI-NEXT:    v_add_i32_e32 v8, vcc, 52, v0
 ; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
@@ -10741,7 +10739,7 @@ define inreg <40 x i16> @bitcast_v20f32_to_v40i16_scalar(<20 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v7, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v29
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v35
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 60, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -10753,45 +10751,45 @@ define inreg <40 x i16> @bitcast_v20f32_to_v40i16_scalar(<20 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v27
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v34
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x44, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v18
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v21
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x48, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v25
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v33
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x4c, v0
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB29_4:
-; SI-NEXT:    ; implicit-def: $vgpr37
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr50
+; SI-NEXT:    ; implicit-def: $vgpr30
+; SI-NEXT:    ; implicit-def: $vgpr49
+; SI-NEXT:    ; implicit-def: $vgpr29
 ; SI-NEXT:    ; implicit-def: $vgpr48
-; SI-NEXT:    ; implicit-def: $vgpr35
+; SI-NEXT:    ; implicit-def: $vgpr28
 ; SI-NEXT:    ; implicit-def: $vgpr39
-; SI-NEXT:    ; implicit-def: $vgpr33
+; SI-NEXT:    ; implicit-def: $vgpr25
 ; SI-NEXT:    ; implicit-def: $vgpr38
-; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr37
 ; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr28
+; SI-NEXT:    ; implicit-def: $vgpr35
 ; SI-NEXT:    ; implicit-def: $vgpr34
-; SI-NEXT:    ; implicit-def: $vgpr26
-; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr33
+; SI-NEXT:    ; implicit-def: $vgpr27
 ; SI-NEXT:    ; implicit-def: $vgpr24
-; SI-NEXT:    ; implicit-def: $vgpr30
 ; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; implicit-def: $vgpr29
 ; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $vgpr18
-; SI-NEXT:    ; implicit-def: $vgpr25
+; SI-NEXT:    ; implicit-def: $vgpr21
 ; SI-NEXT:    s_branch .LBB29_2
 ;
 ; VI-LABEL: bitcast_v20f32_to_v40i16_scalar:
@@ -17582,240 +17580,238 @@ define inreg <40 x i16> @bitcast_v10i64_to_v40i16_scalar(<10 x i64> inreg %a, i3
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; SI-NEXT:    v_readfirstlane_b32 s11, v1
-; SI-NEXT:    v_readfirstlane_b32 s10, v2
-; SI-NEXT:    v_readfirstlane_b32 s9, v3
-; SI-NEXT:    v_readfirstlane_b32 s8, v4
-; SI-NEXT:    v_readfirstlane_b32 s7, v5
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_readfirstlane_b32 s6, v6
+; SI-NEXT:    v_readfirstlane_b32 s8, v1
+; SI-NEXT:    v_readfirstlane_b32 s9, v2
+; SI-NEXT:    v_readfirstlane_b32 s6, v3
+; SI-NEXT:    v_readfirstlane_b32 s7, v4
+; SI-NEXT:    v_readfirstlane_b32 s4, v5
+; SI-NEXT:    s_and_b64 s[10:11], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s5, v6
 ; SI-NEXT:    s_cbranch_scc0 .LBB41_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s9
-; SI-NEXT:    v_mov_b32_e32 v3, s11
-; SI-NEXT:    v_mov_b32_e32 v4, s28
-; SI-NEXT:    v_mov_b32_e32 v5, s26
-; SI-NEXT:    v_mov_b32_e32 v6, s24
-; SI-NEXT:    v_mov_b32_e32 v7, s22
-; SI-NEXT:    v_mov_b32_e32 v8, s20
-; SI-NEXT:    v_mov_b32_e32 v9, s18
-; SI-NEXT:    v_mov_b32_e32 v10, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v2, s8, v2, 16
-; SI-NEXT:    v_alignbit_b32 v3, s10, v3, 16
-; SI-NEXT:    v_alignbit_b32 v4, s29, v4, 16
-; SI-NEXT:    v_alignbit_b32 v5, s27, v5, 16
-; SI-NEXT:    v_alignbit_b32 v6, s25, v6, 16
-; SI-NEXT:    v_alignbit_b32 v7, s23, v7, 16
-; SI-NEXT:    v_alignbit_b32 v8, s21, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, s19, v9, 16
-; SI-NEXT:    v_alignbit_b32 v10, s17, v10, 16
-; SI-NEXT:    s_lshr_b32 s12, s6, 16
-; SI-NEXT:    s_lshr_b32 s13, s8, 16
-; SI-NEXT:    s_lshr_b32 s14, s10, 16
-; SI-NEXT:    s_lshr_b32 s15, s29, 16
-; SI-NEXT:    s_lshr_b32 s40, s27, 16
-; SI-NEXT:    s_lshr_b32 s41, s25, 16
-; SI-NEXT:    s_lshr_b32 s42, s23, 16
-; SI-NEXT:    s_lshr_b32 s43, s21, 16
-; SI-NEXT:    s_lshr_b32 s44, s19, 16
-; SI-NEXT:    s_lshr_b32 s45, s17, 16
+; SI-NEXT:    s_lshr_b32 s72, s5, 16
+; SI-NEXT:    s_lshr_b32 s73, s7, 16
+; SI-NEXT:    s_lshr_b32 s74, s9, 16
+; SI-NEXT:    s_lshr_b32 s75, s29, 16
+; SI-NEXT:    s_lshr_b32 s76, s27, 16
+; SI-NEXT:    s_lshr_b32 s77, s25, 16
+; SI-NEXT:    s_lshr_b32 s78, s23, 16
+; SI-NEXT:    s_lshr_b32 s79, s21, 16
+; SI-NEXT:    s_lshr_b32 s88, s19, 16
+; SI-NEXT:    s_lshr_b32 s89, s17, 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[42:43], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[58:59], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB41_3
 ; SI-NEXT:  .LBB41_2: ; %cmp.true
-; SI-NEXT:    s_add_u32 s16, s16, 3
-; SI-NEXT:    s_addc_u32 s17, s17, 0
-; SI-NEXT:    s_add_u32 s18, s18, 3
-; SI-NEXT:    s_addc_u32 s19, s19, 0
-; SI-NEXT:    s_add_u32 s20, s20, 3
-; SI-NEXT:    s_addc_u32 s21, s21, 0
-; SI-NEXT:    s_add_u32 s22, s22, 3
-; SI-NEXT:    s_addc_u32 s23, s23, 0
-; SI-NEXT:    s_add_u32 s24, s24, 3
-; SI-NEXT:    s_addc_u32 s25, s25, 0
-; SI-NEXT:    s_add_u32 s26, s26, 3
-; SI-NEXT:    s_addc_u32 s27, s27, 0
-; SI-NEXT:    s_add_u32 s28, s28, 3
+; SI-NEXT:    s_add_u32 s4, s4, 3
+; SI-NEXT:    s_addc_u32 s5, s5, 0
+; SI-NEXT:    s_add_u32 s6, s6, 3
+; SI-NEXT:    s_addc_u32 s7, s7, 0
+; SI-NEXT:    s_add_u32 s8, s8, 3
+; SI-NEXT:    s_addc_u32 s9, s9, 0
+; SI-NEXT:    s_add_u32 s28, s28, 3
 ; SI-NEXT:    s_addc_u32 s29, s29, 0
-; SI-NEXT:    s_add_u32 s11, s11, 3
-; SI-NEXT:    s_addc_u32 s10, s10, 0
-; SI-NEXT:    s_add_u32 s9, s9, 3
-; SI-NEXT:    s_addc_u32 s8, s8, 0
-; SI-NEXT:    s_add_u32 s7, s7, 3
-; SI-NEXT:    s_addc_u32 s6, s6, 0
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s9
-; SI-NEXT:    v_mov_b32_e32 v3, s11
-; SI-NEXT:    v_mov_b32_e32 v4, s28
-; SI-NEXT:    v_mov_b32_e32 v5, s26
-; SI-NEXT:    v_mov_b32_e32 v6, s24
-; SI-NEXT:    v_mov_b32_e32 v7, s22
-; SI-NEXT:    v_mov_b32_e32 v8, s20
-; SI-NEXT:    v_mov_b32_e32 v9, s18
-; SI-NEXT:    v_mov_b32_e32 v10, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v2, s8, v2, 16
-; SI-NEXT:    v_alignbit_b32 v3, s10, v3, 16
-; SI-NEXT:    v_alignbit_b32 v4, s29, v4, 16
-; SI-NEXT:    v_alignbit_b32 v5, s27, v5, 16
-; SI-NEXT:    v_alignbit_b32 v6, s25, v6, 16
-; SI-NEXT:    v_alignbit_b32 v7, s23, v7, 16
-; SI-NEXT:    v_alignbit_b32 v8, s21, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, s19, v9, 16
-; SI-NEXT:    v_alignbit_b32 v10, s17, v10, 16
-; SI-NEXT:    s_lshr_b32 s12, s6, 16
-; SI-NEXT:    s_lshr_b32 s13, s8, 16
-; SI-NEXT:    s_lshr_b32 s14, s10, 16
-; SI-NEXT:    s_lshr_b32 s15, s29, 16
-; SI-NEXT:    s_lshr_b32 s40, s27, 16
-; SI-NEXT:    s_lshr_b32 s41, s25, 16
-; SI-NEXT:    s_lshr_b32 s42, s23, 16
-; SI-NEXT:    s_lshr_b32 s43, s21, 16
-; SI-NEXT:    s_lshr_b32 s44, s19, 16
-; SI-NEXT:    s_lshr_b32 s45, s17, 16
+; SI-NEXT:    s_add_u32 s26, s26, 3
+; SI-NEXT:    s_addc_u32 s27, s27, 0
+; SI-NEXT:    s_add_u32 s24, s24, 3
+; SI-NEXT:    s_addc_u32 s25, s25, 0
+; SI-NEXT:    s_add_u32 s22, s22, 3
+; SI-NEXT:    s_addc_u32 s23, s23, 0
+; SI-NEXT:    s_add_u32 s20, s20, 3
+; SI-NEXT:    s_addc_u32 s21, s21, 0
+; SI-NEXT:    s_add_u32 s18, s18, 3
+; SI-NEXT:    s_addc_u32 s19, s19, 0
+; SI-NEXT:    s_add_u32 s16, s16, 3
+; SI-NEXT:    s_addc_u32 s17, s17, 0
+; SI-NEXT:    s_lshr_b32 s72, s5, 16
+; SI-NEXT:    s_lshr_b32 s73, s7, 16
+; SI-NEXT:    s_lshr_b32 s74, s9, 16
+; SI-NEXT:    s_lshr_b32 s75, s29, 16
+; SI-NEXT:    s_lshr_b32 s76, s27, 16
+; SI-NEXT:    s_lshr_b32 s77, s25, 16
+; SI-NEXT:    s_lshr_b32 s78, s23, 16
+; SI-NEXT:    s_lshr_b32 s79, s21, 16
+; SI-NEXT:    s_lshr_b32 s88, s19, 16
+; SI-NEXT:    s_lshr_b32 s89, s17, 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[42:43], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[58:59], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[16:17], 16
 ; SI-NEXT:  .LBB41_3: ; %end
-; SI-NEXT:    s_and_b32 s4, s16, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT:    v_or_b32_e32 v10, s4, v10
-; SI-NEXT:    s_and_b32 s4, s17, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s45, 16
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_mov_b32_e32 v11, s4
-; SI-NEXT:    s_and_b32 s4, s18, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 4, v0
-; SI-NEXT:    v_or_b32_e32 v9, s4, v9
-; SI-NEXT:    s_and_b32 s4, s19, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s44, 16
-; SI-NEXT:    buffer_store_dword v11, v10, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 8, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v9, v10, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v10, s4
-; SI-NEXT:    s_and_b32 s4, s20, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; SI-NEXT:    s_lshl_b32 s11, s60, 16
+; SI-NEXT:    s_and_b32 s13, s16, 0xffff
+; SI-NEXT:    s_or_b32 s11, s13, s11
+; SI-NEXT:    v_mov_b32_e32 v1, s11
+; SI-NEXT:    s_and_b32 s11, s17, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s89, 16
+; SI-NEXT:    s_or_b32 s11, s11, s13
+; SI-NEXT:    v_mov_b32_e32 v2, s11
+; SI-NEXT:    s_lshl_b32 s11, s58, 16
+; SI-NEXT:    s_and_b32 s13, s18, 0xffff
+; SI-NEXT:    s_or_b32 s11, s13, s11
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 12, v0
-; SI-NEXT:    v_or_b32_e32 v8, s4, v8
-; SI-NEXT:    s_and_b32 s4, s21, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s43, 16
-; SI-NEXT:    buffer_store_dword v10, v9, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 16, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v8, v9, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v9, s4
-; SI-NEXT:    s_and_b32 s4, s22, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 20, v0
-; SI-NEXT:    v_or_b32_e32 v7, s4, v7
-; SI-NEXT:    s_and_b32 s4, s23, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s42, 16
-; SI-NEXT:    buffer_store_dword v9, v8, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 24, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v8, s4
-; SI-NEXT:    s_and_b32 s4, s24, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT:    v_mov_b32_e32 v2, s11
+; SI-NEXT:    s_and_b32 s11, s19, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s88, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
+; SI-NEXT:    s_or_b32 s11, s11, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 28, v0
-; SI-NEXT:    v_or_b32_e32 v6, s4, v6
-; SI-NEXT:    s_and_b32 s4, s25, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s41, 16
-; SI-NEXT:    buffer_store_dword v8, v7, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 32, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v7, s4
-; SI-NEXT:    s_and_b32 s4, s26, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_mov_b32_e32 v2, s11
+; SI-NEXT:    s_and_b32 s11, s20, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s56, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 12, v0
+; SI-NEXT:    s_or_b32 s11, s11, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 36, v0
-; SI-NEXT:    v_or_b32_e32 v5, s4, v5
-; SI-NEXT:    s_and_b32 s4, s27, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s40, 16
-; SI-NEXT:    buffer_store_dword v7, v6, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 40, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v6, s4
-; SI-NEXT:    s_and_b32 s4, s28, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_mov_b32_e32 v2, s11
+; SI-NEXT:    s_and_b32 s11, s21, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s79, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 16, v0
+; SI-NEXT:    s_or_b32 s11, s11, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 44, v0
-; SI-NEXT:    v_or_b32_e32 v4, s4, v4
-; SI-NEXT:    s_and_b32 s4, s29, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s15, 16
-; SI-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v5, s4
-; SI-NEXT:    s_and_b32 s4, s11, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_mov_b32_e32 v2, s11
+; SI-NEXT:    s_and_b32 s11, s22, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s46, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 20, v0
+; SI-NEXT:    s_or_b32 s11, s11, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 52, v0
-; SI-NEXT:    v_or_b32_e32 v3, s4, v3
-; SI-NEXT:    s_and_b32 s4, s10, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s14, 16
-; SI-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 56, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v4, s4
-; SI-NEXT:    s_and_b32 s4, s9, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_mov_b32_e32 v2, s11
+; SI-NEXT:    s_and_b32 s11, s23, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s78, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 24, v0
+; SI-NEXT:    s_or_b32 s11, s11, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 60, v0
-; SI-NEXT:    v_or_b32_e32 v2, s4, v2
-; SI-NEXT:    s_and_b32 s4, s8, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s13, 16
-; SI-NEXT:    buffer_store_dword v4, v3, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 64, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v3, s4
-; SI-NEXT:    s_and_b32 s4, s7, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_mov_b32_e32 v2, s11
+; SI-NEXT:    s_and_b32 s11, s24, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s44, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 28, v0
+; SI-NEXT:    s_or_b32 s11, s11, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x44, v0
-; SI-NEXT:    v_or_b32_e32 v1, s4, v1
-; SI-NEXT:    s_and_b32 s4, s6, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s12, 16
-; SI-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x48, v0
+; SI-NEXT:    v_mov_b32_e32 v2, s11
+; SI-NEXT:    s_and_b32 s11, s25, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s77, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 32, v0
+; SI-NEXT:    s_or_b32 s11, s11, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s11
+; SI-NEXT:    s_and_b32 s11, s26, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s42, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 36, v0
+; SI-NEXT:    s_or_b32 s11, s11, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s11
+; SI-NEXT:    s_and_b32 s11, s27, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s76, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 40, v0
+; SI-NEXT:    s_or_b32 s11, s11, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s11
+; SI-NEXT:    s_and_b32 s11, s28, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s40, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 44, v0
+; SI-NEXT:    s_or_b32 s11, s11, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s11
+; SI-NEXT:    s_and_b32 s11, s29, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s75, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 48, v0
+; SI-NEXT:    s_or_b32 s11, s11, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s11
+; SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s14, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 52, v0
+; SI-NEXT:    s_or_b32 s8, s8, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s8, s9, 0xffff
+; SI-NEXT:    s_lshl_b32 s9, s74, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 56, v0
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_lshl_b32 s8, s12, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 60, v0
+; SI-NEXT:    s_or_b32 s6, s6, s8
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s6, s7, 0xffff
+; SI-NEXT:    s_lshl_b32 s7, s73, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 64, v0
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; SI-NEXT:    s_lshl_b32 s6, s10, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x44, v0
+; SI-NEXT:    s_or_b32 s4, s4, s6
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s4
+; SI-NEXT:    s_and_b32 s4, s5, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s72, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x48, v0
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x4c, v0
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB41_4:
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $sgpr45
-; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $sgpr60
+; SI-NEXT:    ; implicit-def: $sgpr89
+; SI-NEXT:    ; implicit-def: $sgpr58
+; SI-NEXT:    ; implicit-def: $sgpr88
+; SI-NEXT:    ; implicit-def: $sgpr56
+; SI-NEXT:    ; implicit-def: $sgpr79
+; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr78
 ; SI-NEXT:    ; implicit-def: $sgpr44
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $sgpr43
-; SI-NEXT:    ; implicit-def: $vgpr7
+; SI-NEXT:    ; implicit-def: $sgpr77
 ; SI-NEXT:    ; implicit-def: $sgpr42
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $sgpr41
-; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $sgpr76
 ; SI-NEXT:    ; implicit-def: $sgpr40
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $sgpr15
-; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $sgpr75
 ; SI-NEXT:    ; implicit-def: $sgpr14
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $sgpr13
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $sgpr74
 ; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr73
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr72
 ; SI-NEXT:    s_branch .LBB41_2
 ;
 ; VI-LABEL: bitcast_v10i64_to_v40i16_scalar:
@@ -23902,126 +23898,126 @@ define inreg <40 x i16> @bitcast_v10f64_to_v40i16_scalar(<10 x double> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v8, s29
 ; SI-NEXT:    s_cbranch_scc0 .LBB49_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_alignbit_b32 v21, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v22, v4, v3, 16
-; SI-NEXT:    v_alignbit_b32 v23, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v24, v8, v7, 16
-; SI-NEXT:    v_alignbit_b32 v26, v10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v28, v12, v11, 16
-; SI-NEXT:    v_alignbit_b32 v30, v14, v13, 16
-; SI-NEXT:    v_alignbit_b32 v33, v16, v15, 16
-; SI-NEXT:    v_alignbit_b32 v35, v18, v17, 16
-; SI-NEXT:    v_alignbit_b32 v37, v20, v19, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v20
+; SI-NEXT:    v_lshr_b64 v[21:22], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[27:28], v[9:10], 16
+; SI-NEXT:    v_lshr_b64 v[22:23], v[3:4], 16
+; SI-NEXT:    v_lshr_b64 v[28:29], v[13:14], 16
+; SI-NEXT:    v_lshr_b64 v[23:24], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[29:30], v[15:16], 16
+; SI-NEXT:    v_lshr_b64 v[24:25], v[7:8], 16
+; SI-NEXT:    v_lshr_b64 v[30:31], v[17:18], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v20
+; SI-NEXT:    v_lshr_b64 v[25:26], v[11:12], 16
+; SI-NEXT:    v_lshr_b64 v[31:32], v[19:20], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB49_3
 ; SI-NEXT:  .LBB49_2: ; %cmp.true
-; SI-NEXT:    v_add_f64 v[19:20], v[19:20], 1.0
-; SI-NEXT:    v_add_f64 v[17:18], v[17:18], 1.0
-; SI-NEXT:    v_add_f64 v[15:16], v[15:16], 1.0
-; SI-NEXT:    v_add_f64 v[13:14], v[13:14], 1.0
-; SI-NEXT:    v_add_f64 v[11:12], v[11:12], 1.0
-; SI-NEXT:    v_add_f64 v[9:10], v[9:10], 1.0
-; SI-NEXT:    v_add_f64 v[7:8], v[7:8], 1.0
-; SI-NEXT:    v_add_f64 v[1:2], v[1:2], 1.0
 ; SI-NEXT:    v_add_f64 v[5:6], v[5:6], 1.0
+; SI-NEXT:    v_add_f64 v[9:10], v[9:10], 1.0
 ; SI-NEXT:    v_add_f64 v[3:4], v[3:4], 1.0
-; SI-NEXT:    v_alignbit_b32 v21, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v22, v4, v3, 16
-; SI-NEXT:    v_alignbit_b32 v23, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v24, v8, v7, 16
-; SI-NEXT:    v_alignbit_b32 v26, v10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v28, v12, v11, 16
-; SI-NEXT:    v_alignbit_b32 v30, v14, v13, 16
-; SI-NEXT:    v_alignbit_b32 v33, v16, v15, 16
-; SI-NEXT:    v_alignbit_b32 v35, v18, v17, 16
-; SI-NEXT:    v_alignbit_b32 v37, v20, v19, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v20
+; SI-NEXT:    v_add_f64 v[13:14], v[13:14], 1.0
+; SI-NEXT:    v_add_f64 v[1:2], v[1:2], 1.0
+; SI-NEXT:    v_add_f64 v[15:16], v[15:16], 1.0
+; SI-NEXT:    v_lshr_b64 v[21:22], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[27:28], v[9:10], 16
+; SI-NEXT:    v_add_f64 v[7:8], v[7:8], 1.0
+; SI-NEXT:    v_add_f64 v[17:18], v[17:18], 1.0
+; SI-NEXT:    v_lshr_b64 v[22:23], v[3:4], 16
+; SI-NEXT:    v_lshr_b64 v[28:29], v[13:14], 16
+; SI-NEXT:    v_add_f64 v[11:12], v[11:12], 1.0
+; SI-NEXT:    v_add_f64 v[19:20], v[19:20], 1.0
+; SI-NEXT:    v_lshr_b64 v[23:24], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[29:30], v[15:16], 16
+; SI-NEXT:    v_lshr_b64 v[24:25], v[7:8], 16
+; SI-NEXT:    v_lshr_b64 v[30:31], v[17:18], 16
+; SI-NEXT:    v_lshr_b64 v[25:26], v[11:12], 16
+; SI-NEXT:    v_lshr_b64 v[31:32], v[19:20], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v20
 ; SI-NEXT:  .LBB49_3: ; %end
+; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v31
 ; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v37
-; SI-NEXT:    v_or_b32_e32 v19, v19, v37
+; SI-NEXT:    v_or_b32_e32 v19, v19, v26
 ; SI-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v48
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v50
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v20
 ; SI-NEXT:    v_add_i32_e32 v20, vcc, 4, v0
 ; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v35
+; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v30
+; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
 ; SI-NEXT:    v_or_b32_e32 v17, v17, v19
 ; SI-NEXT:    v_add_i32_e32 v19, vcc, 8, v0
 ; SI-NEXT:    buffer_store_dword v17, v19, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v18
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v39
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v49
 ; SI-NEXT:    v_or_b32_e32 v17, v17, v18
 ; SI-NEXT:    v_add_i32_e32 v18, vcc, 12, v0
 ; SI-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v33
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v29
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v17
 ; SI-NEXT:    v_add_i32_e32 v17, vcc, 16, v0
 ; SI-NEXT:    buffer_store_dword v15, v17, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v16
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v38
+; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v48
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v16
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 20, v0
 ; SI-NEXT:    buffer_store_dword v15, v16, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v13
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v30
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v28
 ; SI-NEXT:    v_or_b32_e32 v13, v13, v15
 ; SI-NEXT:    v_add_i32_e32 v15, vcc, 24, v0
 ; SI-NEXT:    buffer_store_dword v13, v15, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v14
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v36
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v39
 ; SI-NEXT:    v_or_b32_e32 v13, v13, v14
 ; SI-NEXT:    v_add_i32_e32 v14, vcc, 28, v0
 ; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v28
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v25
 ; SI-NEXT:    v_or_b32_e32 v11, v11, v13
 ; SI-NEXT:    v_add_i32_e32 v13, vcc, 32, v0
 ; SI-NEXT:    buffer_store_dword v11, v13, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v34
+; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v38
 ; SI-NEXT:    v_or_b32_e32 v11, v11, v12
 ; SI-NEXT:    v_add_i32_e32 v12, vcc, 36, v0
 ; SI-NEXT:    buffer_store_dword v11, v12, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v26
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v27
 ; SI-NEXT:    v_or_b32_e32 v9, v9, v11
 ; SI-NEXT:    v_add_i32_e32 v11, vcc, 40, v0
 ; SI-NEXT:    buffer_store_dword v9, v11, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v10
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v32
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v37
 ; SI-NEXT:    v_or_b32_e32 v9, v9, v10
 ; SI-NEXT:    v_add_i32_e32 v10, vcc, 44, v0
 ; SI-NEXT:    buffer_store_dword v9, v10, s[0:3], 0 offen
@@ -24033,7 +24029,7 @@ define inreg <40 x i16> @bitcast_v10f64_to_v40i16_scalar(<10 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v7, v9, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v31
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v36
 ; SI-NEXT:    v_or_b32_e32 v7, v7, v8
 ; SI-NEXT:    v_add_i32_e32 v8, vcc, 52, v0
 ; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
@@ -24045,7 +24041,7 @@ define inreg <40 x i16> @bitcast_v10f64_to_v40i16_scalar(<10 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v7, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v29
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v35
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 60, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -24057,7 +24053,7 @@ define inreg <40 x i16> @bitcast_v10f64_to_v40i16_scalar(<10 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v27
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v34
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x44, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -24069,33 +24065,33 @@ define inreg <40 x i16> @bitcast_v10f64_to_v40i16_scalar(<10 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v25
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v33
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x4c, v0
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB49_4:
-; SI-NEXT:    ; implicit-def: $vgpr37
+; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr50
+; SI-NEXT:    ; implicit-def: $vgpr30
+; SI-NEXT:    ; implicit-def: $vgpr49
+; SI-NEXT:    ; implicit-def: $vgpr29
 ; SI-NEXT:    ; implicit-def: $vgpr48
-; SI-NEXT:    ; implicit-def: $vgpr35
+; SI-NEXT:    ; implicit-def: $vgpr28
 ; SI-NEXT:    ; implicit-def: $vgpr39
-; SI-NEXT:    ; implicit-def: $vgpr33
+; SI-NEXT:    ; implicit-def: $vgpr25
 ; SI-NEXT:    ; implicit-def: $vgpr38
-; SI-NEXT:    ; implicit-def: $vgpr30
+; SI-NEXT:    ; implicit-def: $vgpr37
 ; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr28
+; SI-NEXT:    ; implicit-def: $vgpr35
 ; SI-NEXT:    ; implicit-def: $vgpr34
-; SI-NEXT:    ; implicit-def: $vgpr26
-; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr33
+; SI-NEXT:    ; implicit-def: $vgpr27
 ; SI-NEXT:    ; implicit-def: $vgpr24
-; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; implicit-def: $vgpr29
 ; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr27
 ; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr25
 ; SI-NEXT:    s_branch .LBB49_2
 ;
 ; VI-LABEL: bitcast_v10f64_to_v40i16_scalar:
@@ -32185,338 +32181,367 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i
 ; SI-LABEL: bitcast_v40f16_to_v40i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v35, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v31, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v34, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v32, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v30, v7
-; SI-NEXT:    v_cvt_f16_f32_e32 v39, v8
-; SI-NEXT:    v_cvt_f16_f32_e32 v28, v9
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; SI-NEXT:    v_cvt_f16_f32_e32 v29, v11
-; SI-NEXT:    v_cvt_f16_f32_e32 v38, v12
-; SI-NEXT:    v_cvt_f16_f32_e32 v12, v13
-; SI-NEXT:    v_cvt_f16_f32_e32 v7, v14
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v37, v16
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v17
-; SI-NEXT:    v_cvt_f16_f32_e32 v8, v18
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v19
-; SI-NEXT:    v_cvt_f16_f32_e32 v36, v20
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v21
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v22
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v23
-; SI-NEXT:    v_cvt_f16_f32_e32 v23, v24
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v25
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v26
-; SI-NEXT:    v_cvt_f16_f32_e32 v22, s16
-; SI-NEXT:    v_cvt_f16_f32_e32 v49, s17
-; SI-NEXT:    v_cvt_f16_f32_e32 v19, s18
-; SI-NEXT:    v_cvt_f16_f32_e32 v18, s19
-; SI-NEXT:    v_cvt_f16_f32_e32 v21, s20
-; SI-NEXT:    v_cvt_f16_f32_e32 v26, s21
-; SI-NEXT:    v_cvt_f16_f32_e32 v20, s22
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, s23
-; SI-NEXT:    v_cvt_f16_f32_e32 v16, s24
-; SI-NEXT:    v_cvt_f16_f32_e32 v25, s25
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, s26
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, s27
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, s28
-; SI-NEXT:    v_cvt_f16_f32_e32 v24, s29
+; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f16_f32_e32 v42, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v43, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v44, v6
+; SI-NEXT:    v_cvt_f16_f32_e32 v41, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v55, v10
+; SI-NEXT:    v_cvt_f16_f32_e32 v54, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v52, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v40, v15
+; SI-NEXT:    v_cvt_f16_f32_e32 v53, v18
+; SI-NEXT:    v_cvt_f16_f32_e32 v51, v19
+; SI-NEXT:    v_cvt_f16_f32_e32 v49, v22
+; SI-NEXT:    v_cvt_f16_f32_e32 v50, v23
+; SI-NEXT:    v_cvt_f16_f32_e32 v48, v26
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_cvt_f16_f32_e32 v58, s16
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v59, s19
+; SI-NEXT:    v_cvt_f16_f32_e32 v57, s20
+; SI-NEXT:    v_cvt_f16_f32_e32 v56, s23
+; SI-NEXT:    v_cvt_f16_f32_e32 v47, s24
+; SI-NEXT:    v_cvt_f16_f32_e32 v46, s27
+; SI-NEXT:    v_cvt_f16_f32_e32 v45, s28
+; SI-NEXT:    v_cvt_f16_f32_e32 v18, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v32, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v30, v8
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v28, v12
+; SI-NEXT:    v_cvt_f16_f32_e32 v8, v13
+; SI-NEXT:    v_cvt_f16_f32_e32 v26, v16
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v17
+; SI-NEXT:    v_cvt_f16_f32_e32 v23, v20
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v21
+; SI-NEXT:    v_cvt_f16_f32_e32 v21, v24
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v25
+; SI-NEXT:    v_cvt_f16_f32_e32 v38, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v20, s18
+; SI-NEXT:    v_cvt_f16_f32_e32 v36, s21
+; SI-NEXT:    v_cvt_f16_f32_e32 v16, s22
+; SI-NEXT:    v_cvt_f16_f32_e32 v34, s25
+; SI-NEXT:    v_cvt_f16_f32_e32 v12, s26
+; SI-NEXT:    v_cvt_f16_f32_e32 v25, s29
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v27
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_cbranch_scc0 .LBB59_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_cbranch_execnz .LBB59_3
 ; SI-NEXT:  .LBB59_2: ; %cmp.true
-; SI-NEXT:    v_cvt_f32_f16_e32 v27, v49
-; SI-NEXT:    v_cvt_f32_f16_e32 v26, v26
-; SI-NEXT:    v_cvt_f32_f16_e32 v25, v25
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, v48
-; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v27
-; SI-NEXT:    v_add_f32_e32 v26, 0x38000000, v26
-; SI-NEXT:    v_cvt_f16_f32_e32 v27, v27
-; SI-NEXT:    v_cvt_f16_f32_e32 v49, v26
-; SI-NEXT:    v_add_f32_e32 v25, 0x38000000, v25
-; SI-NEXT:    v_cvt_f16_f32_e32 v25, v25
-; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v27
-; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v49
-; SI-NEXT:    v_cvt_f32_f16_e32 v49, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v25
-; SI-NEXT:    v_add_f32_e32 v48, 0x38000000, v48
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v37
-; SI-NEXT:    v_add_f32_e32 v25, 0x38000000, v49
-; SI-NEXT:    v_cvt_f16_f32_e32 v25, v25
-; SI-NEXT:    v_cvt_f16_f32_e32 v49, v48
-; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v37
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v25
-; SI-NEXT:    v_cvt_f32_f16_e32 v25, v34
-; SI-NEXT:    v_cvt_f32_f16_e32 v34, v39
-; SI-NEXT:    v_lshlrev_b32_e32 v50, 16, v49
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT:    v_add_f32_e32 v25, 0x38000000, v25
-; SI-NEXT:    v_cvt_f16_f32_e32 v25, v25
-; SI-NEXT:    v_add_f32_e32 v34, 0x38000000, v34
-; SI-NEXT:    v_cvt_f16_f32_e32 v39, v34
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v38
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v36
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v34
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, v25
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v22, v45
+; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT:    v_or_b32_e32 v34, v25, v50
-; SI-NEXT:    v_cvt_f32_f16_e32 v25, v38
-; SI-NEXT:    v_lshlrev_b32_e32 v38, 16, v39
-; SI-NEXT:    v_cvt_f16_f32_e32 v39, v37
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:    v_add_f32_e32 v25, 0x38000000, v25
-; SI-NEXT:    v_cvt_f16_f32_e32 v25, v25
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v25
-; SI-NEXT:    v_cvt_f32_f16_e32 v25, v36
-; SI-NEXT:    v_cvt_f32_f16_e32 v36, v23
+; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v1
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_add_f32_e32 v22, 0x38000000, v22
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v1
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v30
+; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v32
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_or_b32_e32 v45, v22, v17
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; SI-NEXT:    v_add_f32_e32 v25, 0x38000000, v25
-; SI-NEXT:    v_cvt_f16_f32_e32 v25, v25
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v26
+; SI-NEXT:    v_cvt_f32_f16_e32 v22, v54
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v28
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT:    v_add_f32_e32 v22, 0x38000000, v22
+; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_add_f32_e32 v21, 0x38000000, v21
+; SI-NEXT:    v_cvt_f16_f32_e32 v21, v21
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v25
-; SI-NEXT:    v_add_f32_e32 v25, 0x38000000, v36
-; SI-NEXT:    v_cvt_f16_f32_e32 v25, v25
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v57
+; SI-NEXT:    v_or_b32_e32 v54, v22, v7
+; SI-NEXT:    v_cvt_f32_f16_e32 v22, v51
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v58
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_add_f32_e32 v22, 0x38000000, v22
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_or_b32_e32 v57, v3, v15
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v43
+; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
+; SI-NEXT:    v_or_b32_e32 v58, v1, v19
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v47
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_or_b32_e32 v43, v3, v13
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v40
+; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
+; SI-NEXT:    v_or_b32_e32 v47, v1, v11
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v41
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
-; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_or_b32_e32 v40, v3, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT:    v_or_b32_e32 v41, v1, v9
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v23
 ; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v52, 16, v25
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v28, v28
-; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; SI-NEXT:    v_or_b32_e32 v5, v5, v25
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v2
-; SI-NEXT:    v_or_b32_e32 v3, v3, v25
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v8
-; SI-NEXT:    v_or_b32_e32 v9, v9, v25
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v7
+; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
+; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
 ; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v50
+; SI-NEXT:    v_or_b32_e32 v51, v22, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v22, v48
 ; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; SI-NEXT:    v_add_f32_e32 v28, 0x38000000, v28
-; SI-NEXT:    v_or_b32_e32 v12, v12, v25
-; SI-NEXT:    v_cvt_f32_f16_e32 v25, v32
-; SI-NEXT:    v_cvt_f16_f32_e32 v28, v28
-; SI-NEXT:    v_cvt_f32_f16_e32 v33, v33
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v35, v35
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v23, v1
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v22
+; SI-NEXT:    v_cvt_f32_f16_e32 v22, v49
+; SI-NEXT:    v_cvt_f16_f32_e32 v48, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v21
 ; SI-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
-; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v10
-; SI-NEXT:    v_add_f32_e32 v25, 0x38000000, v25
-; SI-NEXT:    v_cvt_f32_f16_e32 v20, v20
+; SI-NEXT:    v_add_f32_e32 v22, 0x38000000, v22
+; SI-NEXT:    v_cvt_f16_f32_e32 v49, v22
+; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v48
+; SI-NEXT:    v_or_b32_e32 v2, v2, v21
+; SI-NEXT:    v_cvt_f32_f16_e32 v21, v53
+; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v49
+; SI-NEXT:    v_or_b32_e32 v4, v4, v22
+; SI-NEXT:    v_cvt_f32_f16_e32 v22, v52
+; SI-NEXT:    v_add_f32_e32 v21, 0x38000000, v21
+; SI-NEXT:    v_cvt_f16_f32_e32 v53, v21
 ; SI-NEXT:    v_cvt_f32_f16_e32 v18, v18
-; SI-NEXT:    v_or_b32_e32 v28, v28, v32
-; SI-NEXT:    v_cvt_f16_f32_e32 v32, v25
-; SI-NEXT:    v_add_f32_e32 v25, 0x38000000, v33
-; SI-NEXT:    v_add_f32_e32 v31, 0x38000000, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v19, v19
-; SI-NEXT:    v_cvt_f32_f16_e32 v22, v22
-; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; SI-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; SI-NEXT:    v_cvt_f32_f16_e32 v30, v30
-; SI-NEXT:    v_cvt_f32_f16_e32 v29, v29
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v25, v25
-; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
-; SI-NEXT:    v_add_f32_e32 v35, 0x38000000, v35
-; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
-; SI-NEXT:    v_cvt_f16_f32_e32 v35, v35
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
+; SI-NEXT:    v_add_f32_e32 v21, 0x38000000, v22
+; SI-NEXT:    v_cvt_f32_f16_e32 v22, v55
+; SI-NEXT:    v_cvt_f16_f32_e32 v52, v21
+; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v53
+; SI-NEXT:    v_or_b32_e32 v6, v6, v21
+; SI-NEXT:    v_add_f32_e32 v22, 0x38000000, v22
+; SI-NEXT:    v_cvt_f16_f32_e32 v55, v22
+; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v52
+; SI-NEXT:    v_or_b32_e32 v8, v8, v21
+; SI-NEXT:    v_cvt_f32_f16_e32 v21, v44
+; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v55
+; SI-NEXT:    v_or_b32_e32 v10, v10, v22
+; SI-NEXT:    v_cvt_f32_f16_e32 v22, v42
+; SI-NEXT:    v_add_f32_e32 v21, 0x38000000, v21
+; SI-NEXT:    v_cvt_f16_f32_e32 v44, v21
 ; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
-; SI-NEXT:    v_add_f32_e32 v17, 0x38000000, v17
+; SI-NEXT:    v_add_f32_e32 v21, 0x38000000, v22
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
-; SI-NEXT:    v_add_f32_e32 v20, 0x38000000, v20
+; SI-NEXT:    v_cvt_f16_f32_e32 v42, v21
 ; SI-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
-; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v32
-; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
 ; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; SI-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
-; SI-NEXT:    v_add_f32_e32 v22, 0x38000000, v22
+; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v44
+; SI-NEXT:    v_or_b32_e32 v14, v14, v21
+; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v42
+; SI-NEXT:    v_or_b32_e32 v18, v18, v21
+; SI-NEXT:    v_cvt_f32_f16_e32 v21, v56
+; SI-NEXT:    v_cvt_f32_f16_e32 v22, v46
+; SI-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
 ; SI-NEXT:    v_add_f32_e32 v21, 0x38000000, v21
+; SI-NEXT:    v_cvt_f16_f32_e32 v56, v21
+; SI-NEXT:    v_cvt_f32_f16_e32 v21, v59
+; SI-NEXT:    v_cvt_f32_f16_e32 v20, v20
+; SI-NEXT:    v_add_f32_e32 v22, 0x38000000, v22
 ; SI-NEXT:    v_add_f32_e32 v16, 0x38000000, v16
-; SI-NEXT:    v_add_f32_e32 v15, 0x38000000, v15
-; SI-NEXT:    v_add_f32_e32 v30, 0x38000000, v30
-; SI-NEXT:    v_add_f32_e32 v29, 0x38000000, v29
-; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
-; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
-; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT:    v_or_b32_e32 v33, v25, v33
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v31
-; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
-; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
-; SI-NEXT:    v_cvt_f16_f32_e32 v21, v21
+; SI-NEXT:    v_add_f32_e32 v21, 0x38000000, v21
+; SI-NEXT:    v_cvt_f16_f32_e32 v46, v22
+; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v30, v30
-; SI-NEXT:    v_cvt_f16_f32_e32 v29, v29
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; SI-NEXT:    v_or_b32_e32 v35, v35, v25
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v13
-; SI-NEXT:    v_or_b32_e32 v14, v14, v25
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v17
-; SI-NEXT:    v_or_b32_e32 v20, v20, v25
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v18
-; SI-NEXT:    v_lshlrev_b32_e32 v51, 16, v39
-; SI-NEXT:    v_or_b32_e32 v19, v19, v25
-; SI-NEXT:    v_or_b32_e32 v22, v22, v26
-; SI-NEXT:    v_or_b32_e32 v21, v21, v27
-; SI-NEXT:    v_or_b32_e32 v16, v16, v24
-; SI-NEXT:    v_or_b32_e32 v15, v15, v48
-; SI-NEXT:    v_or_b32_e32 v30, v30, v38
-; SI-NEXT:    v_or_b32_e32 v29, v29, v37
-; SI-NEXT:    v_or_b32_e32 v11, v11, v51
-; SI-NEXT:    v_or_b32_e32 v6, v6, v23
-; SI-NEXT:    v_or_b32_e32 v4, v4, v52
-; SI-NEXT:    v_alignbit_b32 v49, v19, v26, 16
-; SI-NEXT:    v_alignbit_b32 v26, v20, v27, 16
-; SI-NEXT:    v_alignbit_b32 v25, v14, v24, 16
-; SI-NEXT:    v_alignbit_b32 v24, v35, v48, 16
-; SI-NEXT:    v_alignbit_b32 v48, v33, v50, 16
-; SI-NEXT:    v_alignbit_b32 v39, v28, v38, 16
-; SI-NEXT:    v_alignbit_b32 v38, v12, v37, 16
-; SI-NEXT:    v_alignbit_b32 v37, v9, v51, 16
-; SI-NEXT:    v_alignbit_b32 v36, v3, v23, 16
-; SI-NEXT:    v_alignbit_b32 v23, v5, v52, 16
+; SI-NEXT:    v_cvt_f16_f32_e32 v59, v21
+; SI-NEXT:    v_add_f32_e32 v20, 0x38000000, v20
+; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
+; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v56
+; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v46
+; SI-NEXT:    v_or_b32_e32 v16, v16, v21
+; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v59
+; SI-NEXT:    v_or_b32_e32 v12, v12, v22
+; SI-NEXT:    v_or_b32_e32 v20, v20, v21
+; SI-NEXT:    v_lshr_b64 v[25:26], v[17:18], 16
+; SI-NEXT:    v_or_b32_e32 v50, v23, v1
+; SI-NEXT:    v_lshr_b64 v[38:39], v[19:20], 16
+; SI-NEXT:    v_lshr_b64 v[36:37], v[15:16], 16
+; SI-NEXT:    v_lshr_b64 v[34:35], v[11:12], 16
+; SI-NEXT:    v_lshr_b64 v[32:33], v[13:14], 16
+; SI-NEXT:    v_lshr_b64 v[30:31], v[9:10], 16
+; SI-NEXT:    v_lshr_b64 v[28:29], v[7:8], 16
+; SI-NEXT:    v_lshr_b64 v[26:27], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[23:24], v[3:4], 16
+; SI-NEXT:    v_lshr_b64 v[21:22], v[1:2], 16
 ; SI-NEXT:  .LBB59_3: ; %end
-; SI-NEXT:    v_and_b32_e32 v22, 0xffff, v22
-; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v49
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; SI-NEXT:    v_or_b32_e32 v22, v22, v27
-; SI-NEXT:    v_or_b32_e32 v18, v19, v18
-; SI-NEXT:    v_add_i32_e32 v19, vcc, 4, v0
-; SI-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v18, v19, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v38
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v58
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v21
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v26
-; SI-NEXT:    v_or_b32_e32 v18, v18, v19
-; SI-NEXT:    v_add_i32_e32 v19, vcc, 8, v0
-; SI-NEXT:    buffer_store_dword v18, v19, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v20
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v59
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 4, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT:    v_or_b32_e32 v17, v18, v17
-; SI-NEXT:    v_add_i32_e32 v18, vcc, 12, v0
-; SI-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v36
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v57
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 8, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v25
-; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT:    v_or_b32_e32 v16, v16, v17
-; SI-NEXT:    v_add_i32_e32 v17, vcc, 16, v0
-; SI-NEXT:    v_or_b32_e32 v13, v14, v13
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 20, v0
-; SI-NEXT:    buffer_store_dword v16, v17, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v16
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v56
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 12, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v15
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v24
-; SI-NEXT:    v_or_b32_e32 v13, v13, v14
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 24, v0
-; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v34
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v47
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v35
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v31
-; SI-NEXT:    v_or_b32_e32 v13, v13, v14
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 28, v0
-; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v12
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v46
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 20, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v34
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v48
-; SI-NEXT:    v_or_b32_e32 v13, v13, v14
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 32, v0
-; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v45
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v25
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 24, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v33
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v32
-; SI-NEXT:    v_or_b32_e32 v13, v13, v14
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 36, v0
-; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v18
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v42
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 28, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v30
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v39
-; SI-NEXT:    v_or_b32_e32 v13, v13, v14
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 40, v0
-; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v43
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v32
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 32, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v28
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT:    v_or_b32_e32 v10, v13, v10
-; SI-NEXT:    v_add_i32_e32 v13, vcc, 44, v0
-; SI-NEXT:    buffer_store_dword v10, v13, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v44
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 36, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v29
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v38
-; SI-NEXT:    v_or_b32_e32 v10, v10, v13
-; SI-NEXT:    v_add_i32_e32 v13, vcc, 48, v0
-; SI-NEXT:    buffer_store_dword v10, v13, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v41
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v30
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 40, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    v_or_b32_e32 v7, v10, v7
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 52, v0
-; SI-NEXT:    buffer_store_dword v7, v10, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v10
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v55
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 44, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v37
-; SI-NEXT:    v_or_b32_e32 v7, v7, v10
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 56, v0
-; SI-NEXT:    buffer_store_dword v7, v10, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v54
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v28
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 48, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; SI-NEXT:    v_or_b32_e32 v7, v7, v8
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 60, v0
-; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v8
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v52
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 52, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v36
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v6, v6, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 64, v0
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x44, v0
-; SI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v40
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v26
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 56, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v4
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v53
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 60, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v51
 ; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v23
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 64, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v49
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x44, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v50
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v21
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x48, v0
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v1, v2, v1
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v48
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x4c, v0
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB59_4:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
index ab1f8606cffd7..67c9bfe9d9f3b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
@@ -1065,24 +1065,23 @@ define inreg <4 x i16> @bitcast_i64_to_v4i16_scalar(i64 inreg %a, i32 inreg %b)
 ; SI-NEXT:    s_cmp_lg_u32 s18, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB13_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s17, 16
+; SI-NEXT:    s_lshr_b32 s8, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB13_3
 ; SI-NEXT:  .LBB13_2: ; %cmp.true
 ; SI-NEXT:    s_add_u32 s16, s16, 3
 ; SI-NEXT:    s_addc_u32 s17, s17, 0
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s17, 16
+; SI-NEXT:    s_lshr_b32 s8, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[16:17], 16
 ; SI-NEXT:  .LBB13_3: ; %end
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
+; SI-NEXT:    v_mov_b32_e32 v1, s4
 ; SI-NEXT:    v_mov_b32_e32 v2, s17
-; SI-NEXT:    v_mov_b32_e32 v3, s6
+; SI-NEXT:    v_mov_b32_e32 v3, s8
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB13_4:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr8
 ; SI-NEXT:    s_branch .LBB13_2
 ;
 ; VI-LABEL: bitcast_i64_to_v4i16_scalar:
@@ -2708,38 +2707,39 @@ define inreg <8 x i8> @bitcast_i64_to_v8i8_scalar(i64 inreg %a, i32 inreg %b) {
 ; SI-NEXT:    s_cmp_lg_u32 s18, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB25_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v3, s17, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, s17, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 8
-; SI-NEXT:    s_lshr_b32 s6, s17, 24
-; SI-NEXT:    s_lshr_b32 s7, s17, 16
-; SI-NEXT:    s_lshr_b32 s8, s17, 8
+; SI-NEXT:    s_lshr_b32 s12, s17, 24
+; SI-NEXT:    s_lshr_b32 s13, s17, 16
+; SI-NEXT:    s_lshr_b32 s14, s17, 8
+; SI-NEXT:    s_lshr_b64 s[4:5], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[6:7], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[16:17], 8
 ; SI-NEXT:    s_cbranch_execnz .LBB25_3
 ; SI-NEXT:  .LBB25_2: ; %cmp.true
 ; SI-NEXT:    s_add_u32 s16, s16, 3
 ; SI-NEXT:    s_addc_u32 s17, s17, 0
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v3, s17, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, s17, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 8
-; SI-NEXT:    s_lshr_b32 s6, s17, 24
-; SI-NEXT:    s_lshr_b32 s7, s17, 16
-; SI-NEXT:    s_lshr_b32 s8, s17, 8
+; SI-NEXT:    s_lshr_b32 s12, s17, 24
+; SI-NEXT:    s_lshr_b32 s13, s17, 16
+; SI-NEXT:    s_lshr_b32 s14, s17, 8
+; SI-NEXT:    s_lshr_b64 s[4:5], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[6:7], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[16:17], 8
 ; SI-NEXT:  .LBB25_3: ; %end
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
+; SI-NEXT:    v_mov_b32_e32 v1, s8
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    v_mov_b32_e32 v3, s4
 ; SI-NEXT:    v_mov_b32_e32 v4, s17
-; SI-NEXT:    v_mov_b32_e32 v5, s8
-; SI-NEXT:    v_mov_b32_e32 v6, s7
-; SI-NEXT:    v_mov_b32_e32 v7, s6
+; SI-NEXT:    v_mov_b32_e32 v5, s14
+; SI-NEXT:    v_mov_b32_e32 v6, s13
+; SI-NEXT:    v_mov_b32_e32 v7, s12
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB25_4:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $sgpr7
 ; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr13
+; SI-NEXT:    ; implicit-def: $sgpr12
 ; SI-NEXT:    s_branch .LBB25_2
 ;
 ; VI-LABEL: bitcast_i64_to_v8i8_scalar:
@@ -4222,23 +4222,23 @@ define inreg <4 x i16> @bitcast_f64_to_v4i16_scalar(double inreg %a, i32 inreg %
 ; SI-NEXT:    s_cmp_lg_u32 s18, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB37_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s17, 16
+; SI-NEXT:    s_lshr_b32 s8, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB37_4
 ; SI-NEXT:  .LBB37_2: ; %cmp.true
 ; SI-NEXT:    v_add_f64 v[4:5], s[16:17], 1.0
-; SI-NEXT:    v_alignbit_b32 v1, v5, v4, 16
+; SI-NEXT:    v_lshr_b64 v[1:2], v[4:5], 16
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
 ; SI-NEXT:    s_branch .LBB37_5
 ; SI-NEXT:  .LBB37_3:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr8
 ; SI-NEXT:    s_branch .LBB37_2
 ; SI-NEXT:  .LBB37_4:
 ; SI-NEXT:    v_mov_b32_e32 v5, s17
 ; SI-NEXT:    v_mov_b32_e32 v4, s16
-; SI-NEXT:    v_mov_b32_e32 v3, s6
+; SI-NEXT:    v_mov_b32_e32 v3, s8
+; SI-NEXT:    v_mov_b32_e32 v1, s4
 ; SI-NEXT:  .LBB37_5: ; %end
 ; SI-NEXT:    v_mov_b32_e32 v0, v4
 ; SI-NEXT:    v_mov_b32_e32 v2, v5
@@ -5836,40 +5836,43 @@ define inreg <8 x i8> @bitcast_f64_to_v8i8_scalar(double inreg %a, i32 inreg %b)
 ; SI-NEXT:    s_cmp_lg_u32 s18, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB49_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v3, s17, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, s17, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 8
-; SI-NEXT:    s_lshr_b32 s8, s17, 24
-; SI-NEXT:    s_lshr_b32 s7, s17, 16
-; SI-NEXT:    s_lshr_b32 s6, s17, 8
+; SI-NEXT:    s_lshr_b32 s14, s17, 24
+; SI-NEXT:    s_lshr_b32 s13, s17, 16
+; SI-NEXT:    s_lshr_b32 s12, s17, 8
+; SI-NEXT:    s_lshr_b64 s[4:5], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[6:7], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[16:17], 8
 ; SI-NEXT:    s_cbranch_execnz .LBB49_4
 ; SI-NEXT:  .LBB49_2: ; %cmp.true
-; SI-NEXT:    v_add_f64 v[8:9], s[16:17], 1.0
-; SI-NEXT:    v_alignbit_b32 v3, v9, v8, 24
-; SI-NEXT:    v_alignbit_b32 v2, v9, v8, 16
-; SI-NEXT:    v_alignbit_b32 v1, v9, v8, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
+; SI-NEXT:    v_add_f64 v[10:11], s[16:17], 1.0
+; SI-NEXT:    v_lshr_b64 v[3:4], v[10:11], 24
+; SI-NEXT:    v_lshr_b64 v[8:9], v[10:11], 16
+; SI-NEXT:    v_lshr_b64 v[1:2], v[10:11], 8
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 24, v11
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v11
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v11
 ; SI-NEXT:    s_branch .LBB49_5
 ; SI-NEXT:  .LBB49_3:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $sgpr7
 ; SI-NEXT:    ; implicit-def: $sgpr8
+; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr13
+; SI-NEXT:    ; implicit-def: $sgpr14
 ; SI-NEXT:    s_branch .LBB49_2
 ; SI-NEXT:  .LBB49_4:
-; SI-NEXT:    v_mov_b32_e32 v9, s17
-; SI-NEXT:    v_mov_b32_e32 v8, s16
-; SI-NEXT:    v_mov_b32_e32 v7, s8
-; SI-NEXT:    v_mov_b32_e32 v6, s7
-; SI-NEXT:    v_mov_b32_e32 v5, s6
+; SI-NEXT:    v_mov_b32_e32 v11, s17
+; SI-NEXT:    v_mov_b32_e32 v10, s16
+; SI-NEXT:    v_mov_b32_e32 v7, s14
+; SI-NEXT:    v_mov_b32_e32 v6, s13
+; SI-NEXT:    v_mov_b32_e32 v5, s12
+; SI-NEXT:    v_mov_b32_e32 v1, s8
+; SI-NEXT:    v_mov_b32_e32 v8, s6
+; SI-NEXT:    v_mov_b32_e32 v3, s4
 ; SI-NEXT:  .LBB49_5: ; %end
-; SI-NEXT:    v_mov_b32_e32 v0, v8
-; SI-NEXT:    v_mov_b32_e32 v4, v9
+; SI-NEXT:    v_mov_b32_e32 v0, v10
+; SI-NEXT:    v_mov_b32_e32 v2, v8
+; SI-NEXT:    v_mov_b32_e32 v4, v11
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bitcast_f64_to_v8i8_scalar:
@@ -7049,24 +7052,23 @@ define inreg <4 x i16> @bitcast_v2i32_to_v4i16_scalar(<2 x i32> inreg %a, i32 in
 ; SI-NEXT:    s_cmp_lg_u32 s18, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB57_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s17, 16
+; SI-NEXT:    s_lshr_b32 s8, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB57_3
 ; SI-NEXT:  .LBB57_2: ; %cmp.true
-; SI-NEXT:    s_add_i32 s16, s16, 3
 ; SI-NEXT:    s_add_i32 s17, s17, 3
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s17, 16
+; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    s_lshr_b64 s[4:5], s[16:17], 16
+; SI-NEXT:    s_lshr_b32 s8, s17, 16
 ; SI-NEXT:  .LBB57_3: ; %end
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
+; SI-NEXT:    v_mov_b32_e32 v1, s4
 ; SI-NEXT:    v_mov_b32_e32 v2, s17
-; SI-NEXT:    v_mov_b32_e32 v3, s6
+; SI-NEXT:    v_mov_b32_e32 v3, s8
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB57_4:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr8
 ; SI-NEXT:    s_branch .LBB57_2
 ;
 ; VI-LABEL: bitcast_v2i32_to_v4i16_scalar:
@@ -8688,38 +8690,39 @@ define inreg <8 x i8> @bitcast_v2i32_to_v8i8_scalar(<2 x i32> inreg %a, i32 inre
 ; SI-NEXT:    s_cmp_lg_u32 s18, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB69_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v3, s17, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, s17, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 8
-; SI-NEXT:    s_lshr_b32 s6, s17, 24
-; SI-NEXT:    s_lshr_b32 s7, s17, 16
-; SI-NEXT:    s_lshr_b32 s8, s17, 8
+; SI-NEXT:    s_lshr_b32 s12, s17, 24
+; SI-NEXT:    s_lshr_b32 s13, s17, 16
+; SI-NEXT:    s_lshr_b32 s14, s17, 8
+; SI-NEXT:    s_lshr_b64 s[4:5], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[6:7], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[16:17], 8
 ; SI-NEXT:    s_cbranch_execnz .LBB69_3
 ; SI-NEXT:  .LBB69_2: ; %cmp.true
-; SI-NEXT:    s_add_i32 s16, s16, 3
 ; SI-NEXT:    s_add_i32 s17, s17, 3
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v3, s17, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, s17, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 8
-; SI-NEXT:    s_lshr_b32 s6, s17, 24
-; SI-NEXT:    s_lshr_b32 s7, s17, 16
-; SI-NEXT:    s_lshr_b32 s8, s17, 8
+; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    s_lshr_b64 s[4:5], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[6:7], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[16:17], 8
+; SI-NEXT:    s_lshr_b32 s12, s17, 24
+; SI-NEXT:    s_lshr_b32 s13, s17, 16
+; SI-NEXT:    s_lshr_b32 s14, s17, 8
 ; SI-NEXT:  .LBB69_3: ; %end
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
+; SI-NEXT:    v_mov_b32_e32 v1, s8
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    v_mov_b32_e32 v3, s4
 ; SI-NEXT:    v_mov_b32_e32 v4, s17
-; SI-NEXT:    v_mov_b32_e32 v5, s8
-; SI-NEXT:    v_mov_b32_e32 v6, s7
-; SI-NEXT:    v_mov_b32_e32 v7, s6
+; SI-NEXT:    v_mov_b32_e32 v5, s14
+; SI-NEXT:    v_mov_b32_e32 v6, s13
+; SI-NEXT:    v_mov_b32_e32 v7, s12
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB69_4:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $sgpr7
 ; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr13
+; SI-NEXT:    ; implicit-def: $sgpr12
 ; SI-NEXT:    s_branch .LBB69_2
 ;
 ; VI-LABEL: bitcast_v2i32_to_v8i8_scalar:
@@ -9564,24 +9567,27 @@ define inreg <4 x i16> @bitcast_v2f32_to_v4i16_scalar(<2 x float> inreg %a, i32
 ; SI-NEXT:    s_cmp_lg_u32 s18, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB73_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s17, 16
+; SI-NEXT:    s_lshr_b32 s8, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB73_4
 ; SI-NEXT:  .LBB73_2: ; %cmp.true
-; SI-NEXT:    v_add_f32_e64 v2, s17, 1.0
-; SI-NEXT:    v_add_f32_e64 v0, s16, 1.0
-; SI-NEXT:    v_alignbit_b32 v1, v2, v0, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-NEXT:    s_setpc_b64 s[30:31]
+; SI-NEXT:    v_add_f32_e64 v5, s17, 1.0
+; SI-NEXT:    v_add_f32_e64 v4, s16, 1.0
+; SI-NEXT:    v_lshr_b64 v[1:2], v[4:5], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
+; SI-NEXT:    s_branch .LBB73_5
 ; SI-NEXT:  .LBB73_3:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr8
 ; SI-NEXT:    s_branch .LBB73_2
 ; SI-NEXT:  .LBB73_4:
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_mov_b32_e32 v2, s17
-; SI-NEXT:    v_mov_b32_e32 v3, s6
+; SI-NEXT:    v_mov_b32_e32 v4, s16
+; SI-NEXT:    v_mov_b32_e32 v5, s17
+; SI-NEXT:    v_mov_b32_e32 v3, s8
+; SI-NEXT:    v_mov_b32_e32 v1, s4
+; SI-NEXT:  .LBB73_5: ; %end
+; SI-NEXT:    v_mov_b32_e32 v0, v4
+; SI-NEXT:    v_mov_b32_e32 v2, v5
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bitcast_v2f32_to_v4i16_scalar:
@@ -11206,38 +11212,44 @@ define inreg <8 x i8> @bitcast_v2f32_to_v8i8_scalar(<2 x float> inreg %a, i32 in
 ; SI-NEXT:    s_cmp_lg_u32 s18, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB85_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v3, s17, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, s17, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 8
-; SI-NEXT:    s_lshr_b32 s6, s17, 24
-; SI-NEXT:    s_lshr_b32 s7, s17, 16
-; SI-NEXT:    s_lshr_b32 s8, s17, 8
+; SI-NEXT:    s_lshr_b32 s12, s17, 24
+; SI-NEXT:    s_lshr_b32 s13, s17, 16
+; SI-NEXT:    s_lshr_b32 s14, s17, 8
+; SI-NEXT:    s_lshr_b64 s[4:5], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[6:7], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[16:17], 8
 ; SI-NEXT:    s_cbranch_execnz .LBB85_4
 ; SI-NEXT:  .LBB85_2: ; %cmp.true
-; SI-NEXT:    v_add_f32_e64 v4, s17, 1.0
-; SI-NEXT:    v_add_f32_e64 v0, s16, 1.0
-; SI-NEXT:    v_alignbit_b32 v3, v4, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, v4, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, v4, v0, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; SI-NEXT:    s_setpc_b64 s[30:31]
+; SI-NEXT:    v_add_f32_e64 v11, s17, 1.0
+; SI-NEXT:    v_add_f32_e64 v10, s16, 1.0
+; SI-NEXT:    v_lshr_b64 v[3:4], v[10:11], 24
+; SI-NEXT:    v_lshr_b64 v[8:9], v[10:11], 16
+; SI-NEXT:    v_lshr_b64 v[1:2], v[10:11], 8
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 24, v11
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v11
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v11
+; SI-NEXT:    s_branch .LBB85_5
 ; SI-NEXT:  .LBB85_3:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $sgpr7
 ; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr13
+; SI-NEXT:    ; implicit-def: $sgpr12
 ; SI-NEXT:    s_branch .LBB85_2
 ; SI-NEXT:  .LBB85_4:
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_mov_b32_e32 v4, s17
-; SI-NEXT:    v_mov_b32_e32 v5, s8
-; SI-NEXT:    v_mov_b32_e32 v6, s7
-; SI-NEXT:    v_mov_b32_e32 v7, s6
+; SI-NEXT:    v_mov_b32_e32 v10, s16
+; SI-NEXT:    v_mov_b32_e32 v11, s17
+; SI-NEXT:    v_mov_b32_e32 v5, s14
+; SI-NEXT:    v_mov_b32_e32 v6, s13
+; SI-NEXT:    v_mov_b32_e32 v7, s12
+; SI-NEXT:    v_mov_b32_e32 v3, s4
+; SI-NEXT:    v_mov_b32_e32 v8, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s8
+; SI-NEXT:  .LBB85_5: ; %end
+; SI-NEXT:    v_mov_b32_e32 v0, v10
+; SI-NEXT:    v_mov_b32_e32 v2, v8
+; SI-NEXT:    v_mov_b32_e32 v4, v11
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bitcast_v2f32_to_v8i8_scalar:
@@ -12327,7 +12339,7 @@ define inreg <4 x i16> @bitcast_v4f16_to_v4i16_scalar(<4 x half> inreg %a, i32 i
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, s16
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, s17
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, s18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, s19
 ; SI-NEXT:    s_cmp_lg_u32 s20, 0
@@ -12336,23 +12348,24 @@ define inreg <4 x i16> @bitcast_v4f16_to_v4i16_scalar(<4 x half> inreg %a, i32 i
 ; SI-NEXT:    s_cbranch_execnz .LBB91_3
 ; SI-NEXT:  .LBB91_2: ; %cmp.true
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v4
+; SI-NEXT:    v_lshr_b64 v[4:5], v[1:2], 16
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
-; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
 ; SI-NEXT:  .LBB91_3: ; %end
+; SI-NEXT:    v_mov_b32_e32 v1, v4
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB91_4:
 ; SI-NEXT:    s_branch .LBB91_2
@@ -12938,34 +12951,35 @@ define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i3
 ; SI-NEXT:    v_mul_f32_e64 v7, 1.0, s16
 ; SI-NEXT:    v_mul_f32_e64 v6, 1.0, s17
 ; SI-NEXT:    v_mul_f32_e64 v5, 1.0, s18
-; SI-NEXT:    v_mul_f32_e64 v4, 1.0, s19
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s19
 ; SI-NEXT:    s_cbranch_scc0 .LBB95_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v7
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v6
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; SI-NEXT:    s_cbranch_execnz .LBB95_3
 ; SI-NEXT:  .LBB95_2: ; %cmp.true
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v6
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v7
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v4
-; SI-NEXT:    v_alignbit_b32 v0, v2, v0, 16
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT:    v_add_f32_e32 v4, 0x40c00000, v2
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
-; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v7
 ; SI-NEXT:    v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; SI-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
+; SI-NEXT:    v_lshr_b64 v[4:5], v[1:2], 16
+; SI-NEXT:    v_alignbit_b32 v0, v6, v0, 16
 ; SI-NEXT:  .LBB95_3: ; %end
+; SI-NEXT:    v_mov_b32_e32 v1, v4
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB95_4:
 ; SI-NEXT:    ; implicit-def: $vgpr0
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr4
 ; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    s_branch .LBB95_2
@@ -13395,52 +13409,52 @@ define inreg <8 x i8> @bitcast_v4i16_to_v8i8_scalar(<4 x i16> inreg %a, i32 inre
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_and_b32 s4, s16, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s17, 16
-; SI-NEXT:    s_or_b32 s6, s4, s5
-; SI-NEXT:    s_and_b32 s4, s18, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s19, 16
-; SI-NEXT:    s_or_b32 s7, s4, s5
-; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_alignbit_b32 v3, s7, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, s7, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, s7, v0, 8
-; SI-NEXT:    s_lshr_b32 s9, s7, 8
-; SI-NEXT:    s_and_b32 s10, s19, 0xffff
-; SI-NEXT:    s_bfe_u32 s8, s19, 0x80008
+; SI-NEXT:    s_or_b32 s4, s4, s5
+; SI-NEXT:    s_and_b32 s5, s18, 0xffff
+; SI-NEXT:    s_lshl_b32 s6, s19, 16
+; SI-NEXT:    s_or_b32 s5, s5, s6
+; SI-NEXT:    s_lshr_b64 s[6:7], s[4:5], 24
+; SI-NEXT:    s_lshr_b64 s[8:9], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[4:5], 8
+; SI-NEXT:    s_lshr_b32 s9, s5, 8
+; SI-NEXT:    s_and_b32 s11, s19, 0xffff
+; SI-NEXT:    s_bfe_u32 s7, s19, 0x80008
 ; SI-NEXT:    s_cbranch_execnz .LBB97_3
 ; SI-NEXT:  .LBB97_2: ; %cmp.true
 ; SI-NEXT:    s_add_i32 s16, s16, 3
 ; SI-NEXT:    s_and_b32 s4, s16, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s17, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s18, s18, 3
-; SI-NEXT:    s_add_i32 s6, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s18, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s19, 16
 ; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s7, s4, 0x30000
-; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_alignbit_b32 v3, s7, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, s7, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, s7, v0, 8
-; SI-NEXT:    s_lshr_b32 s8, s7, 24
-; SI-NEXT:    s_lshr_b32 s10, s7, 16
-; SI-NEXT:    s_lshr_b32 s9, s7, 8
+; SI-NEXT:    s_and_b32 s5, s18, 0xffff
+; SI-NEXT:    s_lshl_b32 s6, s19, 16
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    s_add_i32 s4, s4, 0x30000
+; SI-NEXT:    s_add_i32 s5, s5, 0x30000
+; SI-NEXT:    s_lshr_b64 s[6:7], s[4:5], 24
+; SI-NEXT:    s_lshr_b64 s[8:9], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[4:5], 8
+; SI-NEXT:    s_lshr_b32 s7, s5, 24
+; SI-NEXT:    s_lshr_b32 s11, s5, 16
+; SI-NEXT:    s_lshr_b32 s9, s5, 8
 ; SI-NEXT:  .LBB97_3: ; %end
-; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_mov_b32_e32 v4, s7
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s10
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_mov_b32_e32 v3, s6
+; SI-NEXT:    v_mov_b32_e32 v4, s5
 ; SI-NEXT:    v_mov_b32_e32 v5, s9
-; SI-NEXT:    v_mov_b32_e32 v6, s10
-; SI-NEXT:    v_mov_b32_e32 v7, s8
+; SI-NEXT:    v_mov_b32_e32 v6, s11
+; SI-NEXT:    v_mov_b32_e32 v7, s7
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB97_4:
-; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr3
-; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $sgpr9
+; SI-NEXT:    ; implicit-def: $sgpr4
 ; SI-NEXT:    ; implicit-def: $sgpr10
 ; SI-NEXT:    ; implicit-def: $sgpr8
+; SI-NEXT:    ; implicit-def: $sgpr6
+; SI-NEXT:    ; implicit-def: $sgpr9
+; SI-NEXT:    ; implicit-def: $sgpr11
+; SI-NEXT:    ; implicit-def: $sgpr7
 ; SI-NEXT:    s_branch .LBB97_2
 ;
 ; VI-LABEL: bitcast_v4i16_to_v8i8_scalar:
@@ -13970,27 +13984,27 @@ define inreg <4 x i16> @bitcast_v8i8_to_v4i16_scalar(<8 x i8> inreg %a, i32 inre
 ; SI-NEXT:    s_cmp_lg_u32 s24, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB99_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    s_and_b32 s4, s20, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s21, 8
+; SI-NEXT:    s_and_b32 s4, s16, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s17, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s22, 0xff
+; SI-NEXT:    s_and_b32 s5, s18, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s6, s23, 24
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s6, s5
-; SI-NEXT:    s_or_b32 s7, s4, s5
-; SI-NEXT:    s_and_b32 s4, s18, 0xff
-; SI-NEXT:    s_lshl_b32 s4, s4, 16
 ; SI-NEXT:    s_lshl_b32 s6, s19, 24
-; SI-NEXT:    s_or_b32 s4, s6, s4
-; SI-NEXT:    s_and_b32 s6, s16, 0xff
-; SI-NEXT:    s_lshl_b32 s8, s17, 8
-; SI-NEXT:    s_or_b32 s6, s6, s8
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    s_and_b32 s6, s6, 0xffff
-; SI-NEXT:    v_alignbit_b32 v1, s7, v0, 16
-; SI-NEXT:    s_or_b32 s6, s6, s4
-; SI-NEXT:    s_lshr_b32 s8, s5, 16
+; SI-NEXT:    s_or_b32 s10, s6, s5
+; SI-NEXT:    s_and_b32 s5, s20, 0xff
+; SI-NEXT:    s_lshl_b32 s6, s21, 8
+; SI-NEXT:    s_or_b32 s5, s5, s6
+; SI-NEXT:    s_and_b32 s6, s22, 0xff
+; SI-NEXT:    s_lshl_b32 s6, s6, 16
+; SI-NEXT:    s_lshl_b32 s7, s23, 24
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_or_b32 s8, s7, s6
+; SI-NEXT:    s_or_b32 s11, s5, s8
+; SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; SI-NEXT:    s_lshr_b64 s[6:7], s[10:11], 16
+; SI-NEXT:    s_or_b32 s4, s4, s10
+; SI-NEXT:    s_lshr_b32 s7, s8, 16
+; SI-NEXT:    s_mov_b32 s5, s11
 ; SI-NEXT:    s_cbranch_execnz .LBB99_3
 ; SI-NEXT:  .LBB99_2: ; %cmp.true
 ; SI-NEXT:    s_add_i32 s16, s16, 3
@@ -14004,34 +14018,33 @@ define inreg <4 x i16> @bitcast_v8i8_to_v4i16_scalar(<8 x i8> inreg %a, i32 inre
 ; SI-NEXT:    s_lshl_b32 s6, s6, 16
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s5, s5, s6
-; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s20, s20, 3
-; SI-NEXT:    s_add_i32 s6, s4, 0x3000000
-; SI-NEXT:    s_and_b32 s4, s20, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s21, 8
-; SI-NEXT:    s_add_i32 s22, s22, 3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    s_and_b32 s5, s20, 0xff
+; SI-NEXT:    s_lshl_b32 s6, s21, 8
+; SI-NEXT:    s_add_i32 s22, s22, 3
+; SI-NEXT:    s_or_b32 s5, s6, s5
 ; SI-NEXT:    s_and_b32 s7, s22, 0xff
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s23, 24
+; SI-NEXT:    s_addk_i32 s5, 0x300
+; SI-NEXT:    s_lshl_b32 s6, s23, 24
 ; SI-NEXT:    s_lshl_b32 s7, s7, 16
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s5, s7
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s7, s4, 0x3000000
-; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_alignbit_b32 v1, s7, v0, 16
-; SI-NEXT:    s_lshr_b32 s8, s7, 16
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    s_add_i32 s4, s4, 0x3000000
+; SI-NEXT:    s_add_i32 s5, s5, 0x3000000
+; SI-NEXT:    s_lshr_b64 s[6:7], s[4:5], 16
+; SI-NEXT:    s_lshr_b32 s7, s5, 16
 ; SI-NEXT:  .LBB99_3: ; %end
-; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_mov_b32_e32 v2, s7
-; SI-NEXT:    v_mov_b32_e32 v3, s8
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s6
+; SI-NEXT:    v_mov_b32_e32 v2, s5
+; SI-NEXT:    v_mov_b32_e32 v3, s7
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB99_4:
+; SI-NEXT:    ; implicit-def: $sgpr4
 ; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $sgpr8
 ; SI-NEXT:    s_branch .LBB99_2
 ;
 ; VI-LABEL: bitcast_v8i8_to_v4i16_scalar:
@@ -15220,53 +15233,55 @@ define inreg <8 x i8> @bitcast_v4f16_to_v8i8_scalar(<4 x half> inreg %a, i32 inr
 ; SI-LABEL: bitcast_v4f16_to_v8i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, s17
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, s16
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v8, s16
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, s19
-; SI-NEXT:    v_cvt_f16_f32_e32 v8, s18
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, s18
 ; SI-NEXT:    s_cmp_lg_u32 s20, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB105_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v10
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v11
+; SI-NEXT:    v_or_b32_e32 v9, v8, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
-; SI-NEXT:    v_or_b32_e32 v0, v9, v0
-; SI-NEXT:    v_or_b32_e32 v4, v8, v1
-; SI-NEXT:    v_alignbit_b32 v3, v4, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, v4, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, v4, v0, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
+; SI-NEXT:    v_or_b32_e32 v10, v0, v1
+; SI-NEXT:    v_lshr_b64 v[3:4], v[9:10], 24
+; SI-NEXT:    v_lshr_b64 v[4:5], v[9:10], 16
+; SI-NEXT:    v_lshr_b64 v[1:2], v[9:10], 8
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v10
 ; SI-NEXT:    v_bfe_u32 v7, v6, 8, 8
 ; SI-NEXT:    s_cbranch_execnz .LBB105_3
 ; SI-NEXT:  .LBB105_2: ; %cmp.true
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, v10
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v9
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v8
-; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v11
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v8
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v6
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v2
-; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v3
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; SI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v3
+; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_or_b32_e32 v9, v2, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
-; SI-NEXT:    v_or_b32_e32 v4, v2, v1
-; SI-NEXT:    v_alignbit_b32 v3, v4, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, v4, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, v4, v0, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
+; SI-NEXT:    v_or_b32_e32 v10, v0, v1
+; SI-NEXT:    v_lshr_b64 v[3:4], v[9:10], 24
+; SI-NEXT:    v_lshr_b64 v[4:5], v[9:10], 16
+; SI-NEXT:    v_lshr_b64 v[1:2], v[9:10], 8
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v10
 ; SI-NEXT:    v_bfe_u32 v7, v6, 8, 8
 ; SI-NEXT:  .LBB105_3: ; %end
+; SI-NEXT:    v_mov_b32_e32 v0, v9
+; SI-NEXT:    v_mov_b32_e32 v2, v4
+; SI-NEXT:    v_mov_b32_e32 v4, v10
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB105_4:
-; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr9
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr4
+; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr5
 ; SI-NEXT:    ; implicit-def: $vgpr7
 ; SI-NEXT:    s_branch .LBB105_2
@@ -16420,48 +16435,50 @@ define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_cmp_lg_u32 s20, 0
-; SI-NEXT:    v_mul_f32_e64 v10, 1.0, s17
-; SI-NEXT:    v_mul_f32_e64 v11, 1.0, s16
-; SI-NEXT:    v_mul_f32_e64 v8, 1.0, s19
-; SI-NEXT:    v_mul_f32_e64 v9, 1.0, s18
+; SI-NEXT:    v_mul_f32_e64 v11, 1.0, s17
+; SI-NEXT:    v_mul_f32_e64 v12, 1.0, s16
+; SI-NEXT:    v_mul_f32_e64 v0, 1.0, s19
+; SI-NEXT:    v_mul_f32_e64 v8, 1.0, s18
 ; SI-NEXT:    s_cbranch_scc0 .LBB109_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v8
-; SI-NEXT:    v_alignbit_b32 v0, v0, v11, 16
-; SI-NEXT:    v_alignbit_b32 v4, v6, v9, 16
-; SI-NEXT:    v_alignbit_b32 v3, v4, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, v4, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, v4, v0, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 24, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v11
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; SI-NEXT:    v_alignbit_b32 v9, v1, v12, 16
+; SI-NEXT:    v_alignbit_b32 v10, v6, v8, 16
+; SI-NEXT:    v_lshr_b64 v[3:4], v[9:10], 24
+; SI-NEXT:    v_lshr_b64 v[4:5], v[9:10], 16
+; SI-NEXT:    v_lshr_b64 v[1:2], v[9:10], 8
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 24, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v10
 ; SI-NEXT:    s_cbranch_execnz .LBB109_3
 ; SI-NEXT:  .LBB109_2: ; %cmp.true
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v10
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v11
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v11
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v12
+; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; SI-NEXT:    v_alignbit_b32 v9, v2, v1, 16
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v8
 ; SI-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v8
-; SI-NEXT:    v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v9
-; SI-NEXT:    v_add_f32_e32 v7, 0x40c00000, v2
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v7
-; SI-NEXT:    v_alignbit_b32 v4, v6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v3, v4, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, v4, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, v4, v0, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 24, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; SI-NEXT:    v_alignbit_b32 v10, v6, v1, 16
+; SI-NEXT:    v_lshr_b64 v[3:4], v[9:10], 24
+; SI-NEXT:    v_lshr_b64 v[4:5], v[9:10], 16
+; SI-NEXT:    v_lshr_b64 v[1:2], v[9:10], 8
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 24, v0
 ; SI-NEXT:  .LBB109_3: ; %end
+; SI-NEXT:    v_mov_b32_e32 v0, v9
+; SI-NEXT:    v_mov_b32_e32 v2, v4
+; SI-NEXT:    v_mov_b32_e32 v4, v10
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB109_4:
-; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr9
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr4
+; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr5
 ; SI-NEXT:    ; implicit-def: $vgpr6
 ; SI-NEXT:    ; implicit-def: $vgpr7
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
index 9f5c9c4c509ed..2cc7c448b2e11 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
@@ -3022,264 +3022,260 @@ define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i3
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
-; SI-NEXT:    v_readfirstlane_b32 s13, v1
-; SI-NEXT:    v_readfirstlane_b32 s12, v2
-; SI-NEXT:    v_readfirstlane_b32 s11, v3
-; SI-NEXT:    v_readfirstlane_b32 s10, v4
-; SI-NEXT:    v_readfirstlane_b32 s9, v5
-; SI-NEXT:    v_readfirstlane_b32 s8, v6
-; SI-NEXT:    v_readfirstlane_b32 s7, v7
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_readfirstlane_b32 s6, v8
+; SI-NEXT:    v_readfirstlane_b32 s10, v1
+; SI-NEXT:    v_readfirstlane_b32 s11, v2
+; SI-NEXT:    v_readfirstlane_b32 s8, v3
+; SI-NEXT:    v_readfirstlane_b32 s9, v4
+; SI-NEXT:    v_readfirstlane_b32 s6, v5
+; SI-NEXT:    v_readfirstlane_b32 s7, v6
+; SI-NEXT:    v_readfirstlane_b32 s4, v7
+; SI-NEXT:    s_and_b64 s[12:13], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s5, v8
 ; SI-NEXT:    s_cbranch_scc0 .LBB13_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s9
-; SI-NEXT:    v_mov_b32_e32 v3, s11
-; SI-NEXT:    v_mov_b32_e32 v4, s13
-; SI-NEXT:    v_mov_b32_e32 v5, s28
-; SI-NEXT:    v_mov_b32_e32 v6, s26
-; SI-NEXT:    v_mov_b32_e32 v7, s24
-; SI-NEXT:    v_mov_b32_e32 v8, s22
-; SI-NEXT:    v_mov_b32_e32 v9, s20
-; SI-NEXT:    v_mov_b32_e32 v10, s18
-; SI-NEXT:    v_mov_b32_e32 v11, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v2, s8, v2, 16
-; SI-NEXT:    v_alignbit_b32 v3, s10, v3, 16
-; SI-NEXT:    v_alignbit_b32 v4, s12, v4, 16
-; SI-NEXT:    v_alignbit_b32 v5, s29, v5, 16
-; SI-NEXT:    v_alignbit_b32 v6, s27, v6, 16
-; SI-NEXT:    v_alignbit_b32 v7, s25, v7, 16
-; SI-NEXT:    v_alignbit_b32 v8, s23, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, s21, v9, 16
-; SI-NEXT:    v_alignbit_b32 v10, s19, v10, 16
-; SI-NEXT:    v_alignbit_b32 v11, s17, v11, 16
-; SI-NEXT:    s_lshr_b32 s14, s6, 16
-; SI-NEXT:    s_lshr_b32 s15, s8, 16
-; SI-NEXT:    s_lshr_b32 s40, s10, 16
-; SI-NEXT:    s_lshr_b32 s41, s12, 16
-; SI-NEXT:    s_lshr_b32 s42, s29, 16
-; SI-NEXT:    s_lshr_b32 s43, s27, 16
-; SI-NEXT:    s_lshr_b32 s44, s25, 16
-; SI-NEXT:    s_lshr_b32 s45, s23, 16
-; SI-NEXT:    s_lshr_b32 s46, s21, 16
-; SI-NEXT:    s_lshr_b32 s47, s19, 16
-; SI-NEXT:    s_lshr_b32 s56, s17, 16
+; SI-NEXT:    s_lshr_b32 s76, s5, 16
+; SI-NEXT:    s_lshr_b32 s77, s7, 16
+; SI-NEXT:    s_lshr_b32 s78, s9, 16
+; SI-NEXT:    s_lshr_b32 s79, s11, 16
+; SI-NEXT:    s_lshr_b32 s88, s29, 16
+; SI-NEXT:    s_lshr_b32 s89, s27, 16
+; SI-NEXT:    s_lshr_b32 s90, s25, 16
+; SI-NEXT:    s_lshr_b32 s91, s23, 16
+; SI-NEXT:    s_lshr_b32 s92, s21, 16
+; SI-NEXT:    s_lshr_b32 s93, s19, 16
+; SI-NEXT:    s_lshr_b32 s94, s17, 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[42:43], s[10:11], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[58:59], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[62:63], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[72:73], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB13_3
 ; SI-NEXT:  .LBB13_2: ; %cmp.true
+; SI-NEXT:    s_add_i32 s17, s17, 3
 ; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    s_add_i32 s19, s19, 3
 ; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_add_i32 s21, s21, 3
 ; SI-NEXT:    s_add_i32 s20, s20, 3
+; SI-NEXT:    s_add_i32 s23, s23, 3
 ; SI-NEXT:    s_add_i32 s22, s22, 3
+; SI-NEXT:    s_add_i32 s25, s25, 3
 ; SI-NEXT:    s_add_i32 s24, s24, 3
+; SI-NEXT:    s_add_i32 s27, s27, 3
 ; SI-NEXT:    s_add_i32 s26, s26, 3
+; SI-NEXT:    s_add_i32 s29, s29, 3
 ; SI-NEXT:    s_add_i32 s28, s28, 3
-; SI-NEXT:    s_add_i32 s13, s13, 3
 ; SI-NEXT:    s_add_i32 s11, s11, 3
-; SI-NEXT:    s_add_i32 s9, s9, 3
-; SI-NEXT:    s_add_i32 s7, s7, 3
-; SI-NEXT:    s_add_i32 s17, s17, 3
-; SI-NEXT:    s_add_i32 s19, s19, 3
-; SI-NEXT:    s_add_i32 s21, s21, 3
-; SI-NEXT:    s_add_i32 s23, s23, 3
-; SI-NEXT:    s_add_i32 s25, s25, 3
-; SI-NEXT:    s_add_i32 s27, s27, 3
-; SI-NEXT:    s_add_i32 s29, s29, 3
-; SI-NEXT:    s_add_i32 s12, s12, 3
 ; SI-NEXT:    s_add_i32 s10, s10, 3
+; SI-NEXT:    s_add_i32 s9, s9, 3
 ; SI-NEXT:    s_add_i32 s8, s8, 3
+; SI-NEXT:    s_add_i32 s7, s7, 3
 ; SI-NEXT:    s_add_i32 s6, s6, 3
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s9
-; SI-NEXT:    v_mov_b32_e32 v3, s11
-; SI-NEXT:    v_mov_b32_e32 v4, s13
-; SI-NEXT:    v_mov_b32_e32 v5, s28
-; SI-NEXT:    v_mov_b32_e32 v6, s26
-; SI-NEXT:    v_mov_b32_e32 v7, s24
-; SI-NEXT:    v_mov_b32_e32 v8, s22
-; SI-NEXT:    v_mov_b32_e32 v9, s20
-; SI-NEXT:    v_mov_b32_e32 v10, s18
-; SI-NEXT:    v_mov_b32_e32 v11, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v2, s8, v2, 16
-; SI-NEXT:    v_alignbit_b32 v3, s10, v3, 16
-; SI-NEXT:    v_alignbit_b32 v4, s12, v4, 16
-; SI-NEXT:    v_alignbit_b32 v5, s29, v5, 16
-; SI-NEXT:    v_alignbit_b32 v6, s27, v6, 16
-; SI-NEXT:    v_alignbit_b32 v7, s25, v7, 16
-; SI-NEXT:    v_alignbit_b32 v8, s23, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, s21, v9, 16
-; SI-NEXT:    v_alignbit_b32 v10, s19, v10, 16
-; SI-NEXT:    v_alignbit_b32 v11, s17, v11, 16
-; SI-NEXT:    s_lshr_b32 s14, s6, 16
-; SI-NEXT:    s_lshr_b32 s15, s8, 16
-; SI-NEXT:    s_lshr_b32 s40, s10, 16
-; SI-NEXT:    s_lshr_b32 s41, s12, 16
-; SI-NEXT:    s_lshr_b32 s42, s29, 16
-; SI-NEXT:    s_lshr_b32 s43, s27, 16
-; SI-NEXT:    s_lshr_b32 s44, s25, 16
-; SI-NEXT:    s_lshr_b32 s45, s23, 16
-; SI-NEXT:    s_lshr_b32 s46, s21, 16
-; SI-NEXT:    s_lshr_b32 s47, s19, 16
-; SI-NEXT:    s_lshr_b32 s56, s17, 16
+; SI-NEXT:    s_add_i32 s5, s5, 3
+; SI-NEXT:    s_add_i32 s4, s4, 3
+; SI-NEXT:    s_lshr_b64 s[12:13], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[42:43], s[10:11], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[26:27], 16
+; SI-NEXT:    s_lshr_b32 s76, s5, 16
+; SI-NEXT:    s_lshr_b32 s77, s7, 16
+; SI-NEXT:    s_lshr_b32 s78, s9, 16
+; SI-NEXT:    s_lshr_b32 s79, s11, 16
+; SI-NEXT:    s_lshr_b32 s88, s29, 16
+; SI-NEXT:    s_lshr_b32 s89, s27, 16
+; SI-NEXT:    s_lshr_b32 s90, s25, 16
+; SI-NEXT:    s_lshr_b32 s91, s23, 16
+; SI-NEXT:    s_lshr_b32 s92, s21, 16
+; SI-NEXT:    s_lshr_b32 s93, s19, 16
+; SI-NEXT:    s_lshr_b32 s94, s17, 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[58:59], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[62:63], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[72:73], s[16:17], 16
 ; SI-NEXT:  .LBB13_3: ; %end
-; SI-NEXT:    s_and_b32 s4, s16, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT:    v_or_b32_e32 v11, s4, v11
-; SI-NEXT:    s_and_b32 s4, s17, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s56, 16
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_mov_b32_e32 v12, s4
-; SI-NEXT:    s_and_b32 s4, s18, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 4, v0
-; SI-NEXT:    v_or_b32_e32 v10, s4, v10
-; SI-NEXT:    s_and_b32 s4, s19, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s47, 16
-; SI-NEXT:    buffer_store_dword v12, v11, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 8, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v10, v11, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v11, s4
-; SI-NEXT:    s_and_b32 s4, s20, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 12, v0
-; SI-NEXT:    v_or_b32_e32 v9, s4, v9
-; SI-NEXT:    s_and_b32 s4, s21, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s46, 16
-; SI-NEXT:    buffer_store_dword v11, v10, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 16, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v9, v10, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v10, s4
-; SI-NEXT:    s_and_b32 s4, s22, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; SI-NEXT:    s_lshl_b32 s13, s72, 16
+; SI-NEXT:    s_and_b32 s15, s16, 0xffff
+; SI-NEXT:    s_or_b32 s13, s15, s13
+; SI-NEXT:    v_mov_b32_e32 v1, s13
+; SI-NEXT:    s_and_b32 s13, s17, 0xffff
+; SI-NEXT:    s_lshl_b32 s15, s94, 16
+; SI-NEXT:    s_or_b32 s13, s13, s15
+; SI-NEXT:    v_mov_b32_e32 v2, s13
+; SI-NEXT:    s_lshl_b32 s13, s62, 16
+; SI-NEXT:    s_and_b32 s15, s18, 0xffff
+; SI-NEXT:    s_or_b32 s13, s15, s13
+; SI-NEXT:    v_mov_b32_e32 v3, s13
+; SI-NEXT:    s_and_b32 s13, s19, 0xffff
+; SI-NEXT:    s_lshl_b32 s15, s93, 16
+; SI-NEXT:    s_or_b32 s13, s13, s15
+; SI-NEXT:    v_mov_b32_e32 v4, s13
+; SI-NEXT:    s_lshl_b32 s13, s60, 16
+; SI-NEXT:    s_and_b32 s15, s20, 0xffff
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 20, v0
-; SI-NEXT:    v_or_b32_e32 v8, s4, v8
-; SI-NEXT:    s_and_b32 s4, s23, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s45, 16
-; SI-NEXT:    buffer_store_dword v10, v9, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 24, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v8, v9, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v9, s4
-; SI-NEXT:    s_and_b32 s4, s24, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
+; SI-NEXT:    s_or_b32 s13, s15, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
+; SI-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 12, v0
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v2, s13
+; SI-NEXT:    s_and_b32 s13, s21, 0xffff
+; SI-NEXT:    s_lshl_b32 s15, s92, 16
+; SI-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 16, v0
+; SI-NEXT:    s_or_b32 s13, s13, s15
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 28, v0
-; SI-NEXT:    v_or_b32_e32 v7, s4, v7
-; SI-NEXT:    s_and_b32 s4, s25, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s44, 16
-; SI-NEXT:    buffer_store_dword v9, v8, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 32, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v8, s4
-; SI-NEXT:    s_and_b32 s4, s26, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT:    v_mov_b32_e32 v2, s13
+; SI-NEXT:    s_and_b32 s13, s22, 0xffff
+; SI-NEXT:    s_lshl_b32 s15, s58, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 20, v0
+; SI-NEXT:    s_or_b32 s13, s13, s15
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 36, v0
-; SI-NEXT:    v_or_b32_e32 v6, s4, v6
-; SI-NEXT:    s_and_b32 s4, s27, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s43, 16
-; SI-NEXT:    buffer_store_dword v8, v7, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 40, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v7, s4
-; SI-NEXT:    s_and_b32 s4, s28, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_mov_b32_e32 v2, s13
+; SI-NEXT:    s_and_b32 s13, s23, 0xffff
+; SI-NEXT:    s_lshl_b32 s15, s91, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 24, v0
+; SI-NEXT:    s_or_b32 s13, s13, s15
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 44, v0
-; SI-NEXT:    v_or_b32_e32 v5, s4, v5
-; SI-NEXT:    s_and_b32 s4, s29, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s42, 16
-; SI-NEXT:    buffer_store_dword v7, v6, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v6, s4
-; SI-NEXT:    s_and_b32 s4, s13, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_mov_b32_e32 v2, s13
+; SI-NEXT:    s_and_b32 s13, s24, 0xffff
+; SI-NEXT:    s_lshl_b32 s15, s56, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 28, v0
+; SI-NEXT:    s_or_b32 s13, s13, s15
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 52, v0
-; SI-NEXT:    v_or_b32_e32 v4, s4, v4
-; SI-NEXT:    s_and_b32 s4, s12, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s41, 16
-; SI-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 56, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v5, s4
-; SI-NEXT:    s_and_b32 s4, s11, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_mov_b32_e32 v2, s13
+; SI-NEXT:    s_and_b32 s13, s25, 0xffff
+; SI-NEXT:    s_lshl_b32 s15, s90, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 32, v0
+; SI-NEXT:    s_or_b32 s13, s13, s15
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 60, v0
-; SI-NEXT:    v_or_b32_e32 v3, s4, v3
-; SI-NEXT:    s_and_b32 s4, s10, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s40, 16
-; SI-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 64, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v4, s4
-; SI-NEXT:    s_and_b32 s4, s9, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_mov_b32_e32 v2, s13
+; SI-NEXT:    s_and_b32 s13, s26, 0xffff
+; SI-NEXT:    s_lshl_b32 s15, s46, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 36, v0
+; SI-NEXT:    s_or_b32 s13, s13, s15
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x44, v0
-; SI-NEXT:    v_or_b32_e32 v2, s4, v2
-; SI-NEXT:    s_and_b32 s4, s8, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s15, 16
-; SI-NEXT:    buffer_store_dword v4, v3, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v3, s4
-; SI-NEXT:    s_and_b32 s4, s7, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_mov_b32_e32 v2, s13
+; SI-NEXT:    s_and_b32 s13, s27, 0xffff
+; SI-NEXT:    s_lshl_b32 s15, s89, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 40, v0
+; SI-NEXT:    s_or_b32 s13, s13, s15
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x4c, v0
-; SI-NEXT:    v_or_b32_e32 v1, s4, v1
-; SI-NEXT:    s_and_b32 s4, s6, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s14, 16
-; SI-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x50, v0
+; SI-NEXT:    v_mov_b32_e32 v2, s13
+; SI-NEXT:    s_and_b32 s13, s28, 0xffff
+; SI-NEXT:    s_lshl_b32 s15, s44, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 44, v0
+; SI-NEXT:    s_or_b32 s13, s13, s15
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s13
+; SI-NEXT:    s_and_b32 s13, s29, 0xffff
+; SI-NEXT:    s_lshl_b32 s15, s88, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 48, v0
+; SI-NEXT:    s_or_b32 s13, s13, s15
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s13
+; SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s42, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 52, v0
+; SI-NEXT:    s_or_b32 s10, s10, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s10, s11, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s79, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 56, v0
+; SI-NEXT:    s_or_b32 s10, s10, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; SI-NEXT:    s_lshl_b32 s10, s40, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 60, v0
+; SI-NEXT:    s_or_b32 s8, s8, s10
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s8, s9, 0xffff
+; SI-NEXT:    s_lshl_b32 s9, s78, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 64, v0
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_lshl_b32 s8, s14, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x44, v0
+; SI-NEXT:    s_or_b32 s6, s6, s8
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s6, s7, 0xffff
+; SI-NEXT:    s_lshl_b32 s7, s77, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x48, v0
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; SI-NEXT:    s_lshl_b32 s6, s12, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x4c, v0
+; SI-NEXT:    s_or_b32 s4, s4, s6
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s4
+; SI-NEXT:    s_and_b32 s4, s5, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s76, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x50, v0
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x54, v0
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB13_4:
-; SI-NEXT:    ; implicit-def: $vgpr11
+; SI-NEXT:    ; implicit-def: $sgpr72
+; SI-NEXT:    ; implicit-def: $sgpr94
+; SI-NEXT:    ; implicit-def: $sgpr62
+; SI-NEXT:    ; implicit-def: $sgpr93
+; SI-NEXT:    ; implicit-def: $sgpr60
+; SI-NEXT:    ; implicit-def: $sgpr92
+; SI-NEXT:    ; implicit-def: $sgpr58
+; SI-NEXT:    ; implicit-def: $sgpr91
 ; SI-NEXT:    ; implicit-def: $sgpr56
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $sgpr47
-; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $sgpr90
 ; SI-NEXT:    ; implicit-def: $sgpr46
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $sgpr45
-; SI-NEXT:    ; implicit-def: $vgpr7
+; SI-NEXT:    ; implicit-def: $sgpr89
 ; SI-NEXT:    ; implicit-def: $sgpr44
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $sgpr43
-; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $sgpr88
 ; SI-NEXT:    ; implicit-def: $sgpr42
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $sgpr41
-; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $sgpr79
 ; SI-NEXT:    ; implicit-def: $sgpr40
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $sgpr15
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $sgpr78
 ; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr77
+; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr76
 ; SI-NEXT:    s_branch .LBB13_2
 ;
 ; VI-LABEL: bitcast_v22i32_to_v44i16_scalar:
@@ -11524,171 +11520,171 @@ define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a,
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
-; SI-NEXT:    v_mov_b32_e32 v23, s16
+; SI-NEXT:    v_mov_b32_e32 v21, s16
 ; SI-NEXT:    v_mov_b32_e32 v22, s17
-; SI-NEXT:    v_mov_b32_e32 v21, s18
-; SI-NEXT:    v_mov_b32_e32 v19, s19
-; SI-NEXT:    v_mov_b32_e32 v18, s20
-; SI-NEXT:    v_mov_b32_e32 v17, s21
-; SI-NEXT:    v_mov_b32_e32 v16, s22
-; SI-NEXT:    v_mov_b32_e32 v15, s23
-; SI-NEXT:    v_mov_b32_e32 v14, s24
-; SI-NEXT:    v_mov_b32_e32 v12, s25
-; SI-NEXT:    v_mov_b32_e32 v13, s26
+; SI-NEXT:    v_mov_b32_e32 v19, s18
+; SI-NEXT:    v_mov_b32_e32 v20, s19
+; SI-NEXT:    v_mov_b32_e32 v17, s20
+; SI-NEXT:    v_mov_b32_e32 v18, s21
+; SI-NEXT:    v_mov_b32_e32 v15, s22
+; SI-NEXT:    v_mov_b32_e32 v16, s23
+; SI-NEXT:    v_mov_b32_e32 v13, s24
+; SI-NEXT:    v_mov_b32_e32 v14, s25
+; SI-NEXT:    v_mov_b32_e32 v11, s26
+; SI-NEXT:    v_mov_b32_e32 v12, s27
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_mov_b32_e32 v11, s27
-; SI-NEXT:    v_mov_b32_e32 v10, s28
-; SI-NEXT:    v_mov_b32_e32 v9, s29
+; SI-NEXT:    v_mov_b32_e32 v9, s28
+; SI-NEXT:    v_mov_b32_e32 v10, s29
 ; SI-NEXT:    s_cbranch_scc0 .LBB29_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_alignbit_b32 v20, v8, v7, 16
-; SI-NEXT:    v_alignbit_b32 v24, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v25, v4, v3, 16
-; SI-NEXT:    v_alignbit_b32 v26, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v28, v9, v10, 16
-; SI-NEXT:    v_alignbit_b32 v30, v11, v13, 16
-; SI-NEXT:    v_alignbit_b32 v32, v12, v14, 16
-; SI-NEXT:    v_alignbit_b32 v34, v15, v16, 16
-; SI-NEXT:    v_alignbit_b32 v37, v17, v18, 16
-; SI-NEXT:    v_alignbit_b32 v39, v19, v21, 16
-; SI-NEXT:    v_alignbit_b32 v49, v22, v23, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v11
-; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v15
-; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v19
-; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v22
+; SI-NEXT:    v_lshr_b64 v[23:24], v[7:8], 16
+; SI-NEXT:    v_lshr_b64 v[24:25], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[25:26], v[3:4], 16
+; SI-NEXT:    v_lshr_b64 v[26:27], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[27:28], v[9:10], 16
+; SI-NEXT:    v_lshr_b64 v[28:29], v[11:12], 16
+; SI-NEXT:    v_lshr_b64 v[29:30], v[13:14], 16
+; SI-NEXT:    v_lshr_b64 v[30:31], v[15:16], 16
+; SI-NEXT:    v_lshr_b64 v[31:32], v[17:18], 16
+; SI-NEXT:    v_lshr_b64 v[32:33], v[19:20], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v22
+; SI-NEXT:    v_lshr_b64 v[33:34], v[21:22], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB29_3
 ; SI-NEXT:  .LBB29_2: ; %cmp.true
-; SI-NEXT:    v_add_f32_e32 v22, 1.0, v22
-; SI-NEXT:    v_add_f32_e32 v23, 1.0, v23
-; SI-NEXT:    v_add_f32_e32 v19, 1.0, v19
-; SI-NEXT:    v_add_f32_e32 v21, 1.0, v21
-; SI-NEXT:    v_add_f32_e32 v17, 1.0, v17
-; SI-NEXT:    v_add_f32_e32 v18, 1.0, v18
-; SI-NEXT:    v_add_f32_e32 v15, 1.0, v15
-; SI-NEXT:    v_add_f32_e32 v16, 1.0, v16
+; SI-NEXT:    v_add_f32_e32 v8, 1.0, v8
+; SI-NEXT:    v_add_f32_e32 v7, 1.0, v7
+; SI-NEXT:    v_add_f32_e32 v6, 1.0, v6
+; SI-NEXT:    v_add_f32_e32 v5, 1.0, v5
+; SI-NEXT:    v_lshr_b64 v[23:24], v[7:8], 16
+; SI-NEXT:    v_add_f32_e32 v4, 1.0, v4
+; SI-NEXT:    v_add_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_lshr_b64 v[24:25], v[5:6], 16
+; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_lshr_b64 v[25:26], v[3:4], 16
+; SI-NEXT:    v_add_f32_e32 v10, 1.0, v10
+; SI-NEXT:    v_add_f32_e32 v9, 1.0, v9
+; SI-NEXT:    v_lshr_b64 v[26:27], v[1:2], 16
 ; SI-NEXT:    v_add_f32_e32 v12, 1.0, v12
-; SI-NEXT:    v_add_f32_e32 v14, 1.0, v14
 ; SI-NEXT:    v_add_f32_e32 v11, 1.0, v11
+; SI-NEXT:    v_lshr_b64 v[27:28], v[9:10], 16
+; SI-NEXT:    v_add_f32_e32 v14, 1.0, v14
 ; SI-NEXT:    v_add_f32_e32 v13, 1.0, v13
-; SI-NEXT:    v_add_f32_e32 v9, 1.0, v9
-; SI-NEXT:    v_add_f32_e32 v10, 1.0, v10
-; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_add_f32_e32 v1, 1.0, v1
-; SI-NEXT:    v_add_f32_e32 v4, 1.0, v4
-; SI-NEXT:    v_add_f32_e32 v3, 1.0, v3
-; SI-NEXT:    v_add_f32_e32 v6, 1.0, v6
-; SI-NEXT:    v_add_f32_e32 v5, 1.0, v5
-; SI-NEXT:    v_add_f32_e32 v8, 1.0, v8
-; SI-NEXT:    v_add_f32_e32 v7, 1.0, v7
-; SI-NEXT:    v_alignbit_b32 v20, v8, v7, 16
-; SI-NEXT:    v_alignbit_b32 v24, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v25, v4, v3, 16
-; SI-NEXT:    v_alignbit_b32 v26, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v28, v9, v10, 16
-; SI-NEXT:    v_alignbit_b32 v30, v11, v13, 16
-; SI-NEXT:    v_alignbit_b32 v32, v12, v14, 16
-; SI-NEXT:    v_alignbit_b32 v34, v15, v16, 16
-; SI-NEXT:    v_alignbit_b32 v37, v17, v18, 16
-; SI-NEXT:    v_alignbit_b32 v39, v19, v21, 16
-; SI-NEXT:    v_alignbit_b32 v49, v22, v23, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v27, 16, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v9
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v11
-; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v15
-; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v19
-; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v22
+; SI-NEXT:    v_lshr_b64 v[28:29], v[11:12], 16
+; SI-NEXT:    v_add_f32_e32 v16, 1.0, v16
+; SI-NEXT:    v_add_f32_e32 v15, 1.0, v15
+; SI-NEXT:    v_lshr_b64 v[29:30], v[13:14], 16
+; SI-NEXT:    v_add_f32_e32 v18, 1.0, v18
+; SI-NEXT:    v_add_f32_e32 v17, 1.0, v17
+; SI-NEXT:    v_lshr_b64 v[30:31], v[15:16], 16
+; SI-NEXT:    v_add_f32_e32 v20, 1.0, v20
+; SI-NEXT:    v_add_f32_e32 v19, 1.0, v19
+; SI-NEXT:    v_lshr_b64 v[31:32], v[17:18], 16
+; SI-NEXT:    v_add_f32_e32 v22, 1.0, v22
+; SI-NEXT:    v_add_f32_e32 v21, 1.0, v21
+; SI-NEXT:    v_lshr_b64 v[32:33], v[19:20], 16
+; SI-NEXT:    v_lshr_b64 v[33:34], v[21:22], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v22
 ; SI-NEXT:  .LBB29_3: ; %end
-; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v23
-; SI-NEXT:    v_lshlrev_b32_e32 v49, 16, v49
-; SI-NEXT:    v_or_b32_e32 v23, v23, v49
-; SI-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v22, 0xffff, v22
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v52
-; SI-NEXT:    v_or_b32_e32 v22, v22, v23
-; SI-NEXT:    v_add_i32_e32 v23, vcc, 4, v0
-; SI-NEXT:    buffer_store_dword v22, v23, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v33
 ; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; SI-NEXT:    v_or_b32_e32 v21, v21, v33
+; SI-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v39
+; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v22
+; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v53
 ; SI-NEXT:    v_or_b32_e32 v21, v21, v22
-; SI-NEXT:    v_add_i32_e32 v22, vcc, 8, v0
+; SI-NEXT:    v_add_i32_e32 v22, vcc, 4, v0
 ; SI-NEXT:    buffer_store_dword v21, v22, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v51
+; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v32
+; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v21
-; SI-NEXT:    v_add_i32_e32 v21, vcc, 12, v0
+; SI-NEXT:    v_add_i32_e32 v21, vcc, 8, v0
 ; SI-NEXT:    buffer_store_dword v19, v21, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v18
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v37
-; SI-NEXT:    v_or_b32_e32 v18, v18, v19
-; SI-NEXT:    v_add_i32_e32 v19, vcc, 16, v0
-; SI-NEXT:    buffer_store_dword v18, v19, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v20
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v52
+; SI-NEXT:    v_or_b32_e32 v19, v19, v20
+; SI-NEXT:    v_add_i32_e32 v20, vcc, 12, v0
+; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v31
 ; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; SI-NEXT:    v_or_b32_e32 v17, v17, v19
+; SI-NEXT:    v_add_i32_e32 v19, vcc, 16, v0
+; SI-NEXT:    buffer_store_dword v17, v19, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v50
+; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v18
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v51
 ; SI-NEXT:    v_or_b32_e32 v17, v17, v18
 ; SI-NEXT:    v_add_i32_e32 v18, vcc, 20, v0
 ; SI-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v34
-; SI-NEXT:    v_or_b32_e32 v16, v16, v17
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v30
+; SI-NEXT:    v_or_b32_e32 v15, v15, v17
 ; SI-NEXT:    v_add_i32_e32 v17, vcc, 24, v0
-; SI-NEXT:    buffer_store_dword v16, v17, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; SI-NEXT:    buffer_store_dword v15, v17, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v48
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v16
+; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v50
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v16
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 28, v0
 ; SI-NEXT:    buffer_store_dword v15, v16, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v13
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v32
-; SI-NEXT:    v_or_b32_e32 v14, v14, v15
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v29
+; SI-NEXT:    v_or_b32_e32 v13, v13, v15
 ; SI-NEXT:    v_add_i32_e32 v15, vcc, 32, v0
-; SI-NEXT:    buffer_store_dword v14, v15, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; SI-NEXT:    buffer_store_dword v13, v15, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v38
-; SI-NEXT:    v_or_b32_e32 v12, v12, v14
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v49
+; SI-NEXT:    v_or_b32_e32 v13, v13, v14
 ; SI-NEXT:    v_add_i32_e32 v14, vcc, 36, v0
-; SI-NEXT:    buffer_store_dword v12, v14, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v12, 0xffff, v13
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v30
-; SI-NEXT:    v_or_b32_e32 v12, v12, v13
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v28
+; SI-NEXT:    v_or_b32_e32 v11, v11, v13
 ; SI-NEXT:    v_add_i32_e32 v13, vcc, 40, v0
-; SI-NEXT:    buffer_store_dword v12, v13, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; SI-NEXT:    buffer_store_dword v11, v13, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v36
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v12
+; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v48
 ; SI-NEXT:    v_or_b32_e32 v11, v11, v12
 ; SI-NEXT:    v_add_i32_e32 v12, vcc, 44, v0
 ; SI-NEXT:    buffer_store_dword v11, v12, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v10
+; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v28
-; SI-NEXT:    v_or_b32_e32 v10, v10, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v27
+; SI-NEXT:    v_or_b32_e32 v9, v9, v11
 ; SI-NEXT:    v_add_i32_e32 v11, vcc, 48, v0
-; SI-NEXT:    buffer_store_dword v10, v11, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; SI-NEXT:    buffer_store_dword v9, v11, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v35
+; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v10
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v39
 ; SI-NEXT:    v_or_b32_e32 v9, v9, v10
 ; SI-NEXT:    v_add_i32_e32 v10, vcc, 52, v0
 ; SI-NEXT:    buffer_store_dword v9, v10, s[0:3], 0 offen
@@ -11700,7 +11696,7 @@ define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v9, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v33
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v38
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 60, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -11712,7 +11708,7 @@ define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v31
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v37
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x44, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -11724,47 +11720,47 @@ define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v29
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v36
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x4c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v20
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v23
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x50, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v27
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v35
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x54, v0
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB29_4:
-; SI-NEXT:    ; implicit-def: $vgpr49
+; SI-NEXT:    ; implicit-def: $vgpr33
+; SI-NEXT:    ; implicit-def: $vgpr53
+; SI-NEXT:    ; implicit-def: $vgpr32
 ; SI-NEXT:    ; implicit-def: $vgpr52
-; SI-NEXT:    ; implicit-def: $vgpr39
+; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    ; implicit-def: $vgpr51
-; SI-NEXT:    ; implicit-def: $vgpr37
+; SI-NEXT:    ; implicit-def: $vgpr30
 ; SI-NEXT:    ; implicit-def: $vgpr50
-; SI-NEXT:    ; implicit-def: $vgpr34
+; SI-NEXT:    ; implicit-def: $vgpr29
+; SI-NEXT:    ; implicit-def: $vgpr49
 ; SI-NEXT:    ; implicit-def: $vgpr48
-; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr39
 ; SI-NEXT:    ; implicit-def: $vgpr38
-; SI-NEXT:    ; implicit-def: $vgpr30
+; SI-NEXT:    ; implicit-def: $vgpr37
 ; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr28
 ; SI-NEXT:    ; implicit-def: $vgpr35
+; SI-NEXT:    ; implicit-def: $vgpr28
+; SI-NEXT:    ; implicit-def: $vgpr27
 ; SI-NEXT:    ; implicit-def: $vgpr26
-; SI-NEXT:    ; implicit-def: $vgpr33
 ; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    ; implicit-def: $vgpr24
-; SI-NEXT:    ; implicit-def: $vgpr29
-; SI-NEXT:    ; implicit-def: $vgpr20
-; SI-NEXT:    ; implicit-def: $vgpr27
+; SI-NEXT:    ; implicit-def: $vgpr23
 ; SI-NEXT:    s_branch .LBB29_2
 ;
 ; VI-LABEL: bitcast_v22f32_to_v44i16_scalar:
@@ -19266,264 +19262,260 @@ define inreg <44 x i16> @bitcast_v11i64_to_v44i16_scalar(<11 x i64> inreg %a, i3
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
-; SI-NEXT:    v_readfirstlane_b32 s13, v1
-; SI-NEXT:    v_readfirstlane_b32 s12, v2
-; SI-NEXT:    v_readfirstlane_b32 s11, v3
-; SI-NEXT:    v_readfirstlane_b32 s10, v4
-; SI-NEXT:    v_readfirstlane_b32 s9, v5
-; SI-NEXT:    v_readfirstlane_b32 s8, v6
-; SI-NEXT:    v_readfirstlane_b32 s7, v7
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_readfirstlane_b32 s6, v8
+; SI-NEXT:    v_readfirstlane_b32 s10, v1
+; SI-NEXT:    v_readfirstlane_b32 s11, v2
+; SI-NEXT:    v_readfirstlane_b32 s8, v3
+; SI-NEXT:    v_readfirstlane_b32 s9, v4
+; SI-NEXT:    v_readfirstlane_b32 s6, v5
+; SI-NEXT:    v_readfirstlane_b32 s7, v6
+; SI-NEXT:    v_readfirstlane_b32 s4, v7
+; SI-NEXT:    s_and_b64 s[12:13], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s5, v8
 ; SI-NEXT:    s_cbranch_scc0 .LBB41_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s9
-; SI-NEXT:    v_mov_b32_e32 v3, s11
-; SI-NEXT:    v_mov_b32_e32 v4, s13
-; SI-NEXT:    v_mov_b32_e32 v5, s28
-; SI-NEXT:    v_mov_b32_e32 v6, s26
-; SI-NEXT:    v_mov_b32_e32 v7, s24
-; SI-NEXT:    v_mov_b32_e32 v8, s22
-; SI-NEXT:    v_mov_b32_e32 v9, s20
-; SI-NEXT:    v_mov_b32_e32 v10, s18
-; SI-NEXT:    v_mov_b32_e32 v11, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v2, s8, v2, 16
-; SI-NEXT:    v_alignbit_b32 v3, s10, v3, 16
-; SI-NEXT:    v_alignbit_b32 v4, s12, v4, 16
-; SI-NEXT:    v_alignbit_b32 v5, s29, v5, 16
-; SI-NEXT:    v_alignbit_b32 v6, s27, v6, 16
-; SI-NEXT:    v_alignbit_b32 v7, s25, v7, 16
-; SI-NEXT:    v_alignbit_b32 v8, s23, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, s21, v9, 16
-; SI-NEXT:    v_alignbit_b32 v10, s19, v10, 16
-; SI-NEXT:    v_alignbit_b32 v11, s17, v11, 16
-; SI-NEXT:    s_lshr_b32 s14, s6, 16
-; SI-NEXT:    s_lshr_b32 s15, s8, 16
-; SI-NEXT:    s_lshr_b32 s40, s10, 16
-; SI-NEXT:    s_lshr_b32 s41, s12, 16
-; SI-NEXT:    s_lshr_b32 s42, s29, 16
-; SI-NEXT:    s_lshr_b32 s43, s27, 16
-; SI-NEXT:    s_lshr_b32 s44, s25, 16
-; SI-NEXT:    s_lshr_b32 s45, s23, 16
-; SI-NEXT:    s_lshr_b32 s46, s21, 16
-; SI-NEXT:    s_lshr_b32 s47, s19, 16
-; SI-NEXT:    s_lshr_b32 s56, s17, 16
+; SI-NEXT:    s_lshr_b32 s76, s5, 16
+; SI-NEXT:    s_lshr_b32 s77, s7, 16
+; SI-NEXT:    s_lshr_b32 s78, s9, 16
+; SI-NEXT:    s_lshr_b32 s79, s11, 16
+; SI-NEXT:    s_lshr_b32 s88, s29, 16
+; SI-NEXT:    s_lshr_b32 s89, s27, 16
+; SI-NEXT:    s_lshr_b32 s90, s25, 16
+; SI-NEXT:    s_lshr_b32 s91, s23, 16
+; SI-NEXT:    s_lshr_b32 s92, s21, 16
+; SI-NEXT:    s_lshr_b32 s93, s19, 16
+; SI-NEXT:    s_lshr_b32 s94, s17, 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[42:43], s[10:11], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[58:59], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[62:63], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[72:73], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB41_3
 ; SI-NEXT:  .LBB41_2: ; %cmp.true
-; SI-NEXT:    s_add_u32 s16, s16, 3
-; SI-NEXT:    s_addc_u32 s17, s17, 0
-; SI-NEXT:    s_add_u32 s18, s18, 3
-; SI-NEXT:    s_addc_u32 s19, s19, 0
-; SI-NEXT:    s_add_u32 s20, s20, 3
-; SI-NEXT:    s_addc_u32 s21, s21, 0
-; SI-NEXT:    s_add_u32 s22, s22, 3
-; SI-NEXT:    s_addc_u32 s23, s23, 0
-; SI-NEXT:    s_add_u32 s24, s24, 3
-; SI-NEXT:    s_addc_u32 s25, s25, 0
-; SI-NEXT:    s_add_u32 s26, s26, 3
-; SI-NEXT:    s_addc_u32 s27, s27, 0
+; SI-NEXT:    s_add_u32 s4, s4, 3
+; SI-NEXT:    s_addc_u32 s5, s5, 0
+; SI-NEXT:    s_add_u32 s6, s6, 3
+; SI-NEXT:    s_addc_u32 s7, s7, 0
+; SI-NEXT:    s_add_u32 s8, s8, 3
+; SI-NEXT:    s_addc_u32 s9, s9, 0
+; SI-NEXT:    s_add_u32 s10, s10, 3
+; SI-NEXT:    s_addc_u32 s11, s11, 0
 ; SI-NEXT:    s_add_u32 s28, s28, 3
 ; SI-NEXT:    s_addc_u32 s29, s29, 0
-; SI-NEXT:    s_add_u32 s13, s13, 3
-; SI-NEXT:    s_addc_u32 s12, s12, 0
-; SI-NEXT:    s_add_u32 s11, s11, 3
-; SI-NEXT:    s_addc_u32 s10, s10, 0
-; SI-NEXT:    s_add_u32 s9, s9, 3
-; SI-NEXT:    s_addc_u32 s8, s8, 0
-; SI-NEXT:    s_add_u32 s7, s7, 3
-; SI-NEXT:    s_addc_u32 s6, s6, 0
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s9
-; SI-NEXT:    v_mov_b32_e32 v3, s11
-; SI-NEXT:    v_mov_b32_e32 v4, s13
-; SI-NEXT:    v_mov_b32_e32 v5, s28
-; SI-NEXT:    v_mov_b32_e32 v6, s26
-; SI-NEXT:    v_mov_b32_e32 v7, s24
-; SI-NEXT:    v_mov_b32_e32 v8, s22
-; SI-NEXT:    v_mov_b32_e32 v9, s20
-; SI-NEXT:    v_mov_b32_e32 v10, s18
-; SI-NEXT:    v_mov_b32_e32 v11, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v2, s8, v2, 16
-; SI-NEXT:    v_alignbit_b32 v3, s10, v3, 16
-; SI-NEXT:    v_alignbit_b32 v4, s12, v4, 16
-; SI-NEXT:    v_alignbit_b32 v5, s29, v5, 16
-; SI-NEXT:    v_alignbit_b32 v6, s27, v6, 16
-; SI-NEXT:    v_alignbit_b32 v7, s25, v7, 16
-; SI-NEXT:    v_alignbit_b32 v8, s23, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, s21, v9, 16
-; SI-NEXT:    v_alignbit_b32 v10, s19, v10, 16
-; SI-NEXT:    v_alignbit_b32 v11, s17, v11, 16
-; SI-NEXT:    s_lshr_b32 s14, s6, 16
-; SI-NEXT:    s_lshr_b32 s15, s8, 16
-; SI-NEXT:    s_lshr_b32 s40, s10, 16
-; SI-NEXT:    s_lshr_b32 s41, s12, 16
-; SI-NEXT:    s_lshr_b32 s42, s29, 16
-; SI-NEXT:    s_lshr_b32 s43, s27, 16
-; SI-NEXT:    s_lshr_b32 s44, s25, 16
-; SI-NEXT:    s_lshr_b32 s45, s23, 16
-; SI-NEXT:    s_lshr_b32 s46, s21, 16
-; SI-NEXT:    s_lshr_b32 s47, s19, 16
-; SI-NEXT:    s_lshr_b32 s56, s17, 16
+; SI-NEXT:    s_add_u32 s26, s26, 3
+; SI-NEXT:    s_addc_u32 s27, s27, 0
+; SI-NEXT:    s_add_u32 s24, s24, 3
+; SI-NEXT:    s_addc_u32 s25, s25, 0
+; SI-NEXT:    s_add_u32 s22, s22, 3
+; SI-NEXT:    s_addc_u32 s23, s23, 0
+; SI-NEXT:    s_add_u32 s20, s20, 3
+; SI-NEXT:    s_addc_u32 s21, s21, 0
+; SI-NEXT:    s_add_u32 s18, s18, 3
+; SI-NEXT:    s_addc_u32 s19, s19, 0
+; SI-NEXT:    s_add_u32 s16, s16, 3
+; SI-NEXT:    s_addc_u32 s17, s17, 0
+; SI-NEXT:    s_lshr_b32 s76, s5, 16
+; SI-NEXT:    s_lshr_b32 s77, s7, 16
+; SI-NEXT:    s_lshr_b32 s78, s9, 16
+; SI-NEXT:    s_lshr_b32 s79, s11, 16
+; SI-NEXT:    s_lshr_b32 s88, s29, 16
+; SI-NEXT:    s_lshr_b32 s89, s27, 16
+; SI-NEXT:    s_lshr_b32 s90, s25, 16
+; SI-NEXT:    s_lshr_b32 s91, s23, 16
+; SI-NEXT:    s_lshr_b32 s92, s21, 16
+; SI-NEXT:    s_lshr_b32 s93, s19, 16
+; SI-NEXT:    s_lshr_b32 s94, s17, 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[42:43], s[10:11], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[58:59], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[62:63], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[72:73], s[16:17], 16
 ; SI-NEXT:  .LBB41_3: ; %end
-; SI-NEXT:    s_and_b32 s4, s16, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT:    v_or_b32_e32 v11, s4, v11
-; SI-NEXT:    s_and_b32 s4, s17, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s56, 16
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_mov_b32_e32 v12, s4
-; SI-NEXT:    s_and_b32 s4, s18, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 4, v0
-; SI-NEXT:    v_or_b32_e32 v10, s4, v10
-; SI-NEXT:    s_and_b32 s4, s19, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s47, 16
-; SI-NEXT:    buffer_store_dword v12, v11, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 8, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v10, v11, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v11, s4
-; SI-NEXT:    s_and_b32 s4, s20, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 12, v0
-; SI-NEXT:    v_or_b32_e32 v9, s4, v9
-; SI-NEXT:    s_and_b32 s4, s21, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s46, 16
-; SI-NEXT:    buffer_store_dword v11, v10, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 16, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v9, v10, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v10, s4
-; SI-NEXT:    s_and_b32 s4, s22, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; SI-NEXT:    s_lshl_b32 s13, s72, 16
+; SI-NEXT:    s_and_b32 s15, s16, 0xffff
+; SI-NEXT:    s_or_b32 s13, s15, s13
+; SI-NEXT:    v_mov_b32_e32 v1, s13
+; SI-NEXT:    s_and_b32 s13, s17, 0xffff
+; SI-NEXT:    s_lshl_b32 s15, s94, 16
+; SI-NEXT:    s_or_b32 s13, s13, s15
+; SI-NEXT:    v_mov_b32_e32 v2, s13
+; SI-NEXT:    s_lshl_b32 s13, s62, 16
+; SI-NEXT:    s_and_b32 s15, s18, 0xffff
+; SI-NEXT:    s_or_b32 s13, s15, s13
+; SI-NEXT:    v_mov_b32_e32 v3, s13
+; SI-NEXT:    s_and_b32 s13, s19, 0xffff
+; SI-NEXT:    s_lshl_b32 s15, s93, 16
+; SI-NEXT:    s_or_b32 s13, s13, s15
+; SI-NEXT:    v_mov_b32_e32 v4, s13
+; SI-NEXT:    s_lshl_b32 s13, s60, 16
+; SI-NEXT:    s_and_b32 s15, s20, 0xffff
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 20, v0
-; SI-NEXT:    v_or_b32_e32 v8, s4, v8
-; SI-NEXT:    s_and_b32 s4, s23, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s45, 16
-; SI-NEXT:    buffer_store_dword v10, v9, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 24, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v8, v9, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v9, s4
-; SI-NEXT:    s_and_b32 s4, s24, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
+; SI-NEXT:    s_or_b32 s13, s15, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
+; SI-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 12, v0
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v2, s13
+; SI-NEXT:    s_and_b32 s13, s21, 0xffff
+; SI-NEXT:    s_lshl_b32 s15, s92, 16
+; SI-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 16, v0
+; SI-NEXT:    s_or_b32 s13, s13, s15
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 28, v0
-; SI-NEXT:    v_or_b32_e32 v7, s4, v7
-; SI-NEXT:    s_and_b32 s4, s25, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s44, 16
-; SI-NEXT:    buffer_store_dword v9, v8, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 32, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v8, s4
-; SI-NEXT:    s_and_b32 s4, s26, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT:    v_mov_b32_e32 v2, s13
+; SI-NEXT:    s_and_b32 s13, s22, 0xffff
+; SI-NEXT:    s_lshl_b32 s15, s58, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 20, v0
+; SI-NEXT:    s_or_b32 s13, s13, s15
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 36, v0
-; SI-NEXT:    v_or_b32_e32 v6, s4, v6
-; SI-NEXT:    s_and_b32 s4, s27, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s43, 16
-; SI-NEXT:    buffer_store_dword v8, v7, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 40, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v7, s4
-; SI-NEXT:    s_and_b32 s4, s28, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_mov_b32_e32 v2, s13
+; SI-NEXT:    s_and_b32 s13, s23, 0xffff
+; SI-NEXT:    s_lshl_b32 s15, s91, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 24, v0
+; SI-NEXT:    s_or_b32 s13, s13, s15
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 44, v0
-; SI-NEXT:    v_or_b32_e32 v5, s4, v5
-; SI-NEXT:    s_and_b32 s4, s29, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s42, 16
-; SI-NEXT:    buffer_store_dword v7, v6, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v6, s4
-; SI-NEXT:    s_and_b32 s4, s13, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_mov_b32_e32 v2, s13
+; SI-NEXT:    s_and_b32 s13, s24, 0xffff
+; SI-NEXT:    s_lshl_b32 s15, s56, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 28, v0
+; SI-NEXT:    s_or_b32 s13, s13, s15
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 52, v0
-; SI-NEXT:    v_or_b32_e32 v4, s4, v4
-; SI-NEXT:    s_and_b32 s4, s12, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s41, 16
-; SI-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 56, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v5, s4
-; SI-NEXT:    s_and_b32 s4, s11, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_mov_b32_e32 v2, s13
+; SI-NEXT:    s_and_b32 s13, s25, 0xffff
+; SI-NEXT:    s_lshl_b32 s15, s90, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 32, v0
+; SI-NEXT:    s_or_b32 s13, s13, s15
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 60, v0
-; SI-NEXT:    v_or_b32_e32 v3, s4, v3
-; SI-NEXT:    s_and_b32 s4, s10, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s40, 16
-; SI-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 64, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v4, s4
-; SI-NEXT:    s_and_b32 s4, s9, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_mov_b32_e32 v2, s13
+; SI-NEXT:    s_and_b32 s13, s26, 0xffff
+; SI-NEXT:    s_lshl_b32 s15, s46, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 36, v0
+; SI-NEXT:    s_or_b32 s13, s13, s15
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x44, v0
-; SI-NEXT:    v_or_b32_e32 v2, s4, v2
-; SI-NEXT:    s_and_b32 s4, s8, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s15, 16
-; SI-NEXT:    buffer_store_dword v4, v3, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v3, s4
-; SI-NEXT:    s_and_b32 s4, s7, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_mov_b32_e32 v2, s13
+; SI-NEXT:    s_and_b32 s13, s27, 0xffff
+; SI-NEXT:    s_lshl_b32 s15, s89, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 40, v0
+; SI-NEXT:    s_or_b32 s13, s13, s15
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x4c, v0
-; SI-NEXT:    v_or_b32_e32 v1, s4, v1
-; SI-NEXT:    s_and_b32 s4, s6, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s14, 16
-; SI-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x50, v0
+; SI-NEXT:    v_mov_b32_e32 v2, s13
+; SI-NEXT:    s_and_b32 s13, s28, 0xffff
+; SI-NEXT:    s_lshl_b32 s15, s44, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 44, v0
+; SI-NEXT:    s_or_b32 s13, s13, s15
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s13
+; SI-NEXT:    s_and_b32 s13, s29, 0xffff
+; SI-NEXT:    s_lshl_b32 s15, s88, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 48, v0
+; SI-NEXT:    s_or_b32 s13, s13, s15
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s13
+; SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s42, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 52, v0
+; SI-NEXT:    s_or_b32 s10, s10, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s10, s11, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s79, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 56, v0
+; SI-NEXT:    s_or_b32 s10, s10, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; SI-NEXT:    s_lshl_b32 s10, s40, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 60, v0
+; SI-NEXT:    s_or_b32 s8, s8, s10
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s8, s9, 0xffff
+; SI-NEXT:    s_lshl_b32 s9, s78, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 64, v0
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_lshl_b32 s8, s14, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x44, v0
+; SI-NEXT:    s_or_b32 s6, s6, s8
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s6, s7, 0xffff
+; SI-NEXT:    s_lshl_b32 s7, s77, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x48, v0
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; SI-NEXT:    s_lshl_b32 s6, s12, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x4c, v0
+; SI-NEXT:    s_or_b32 s4, s4, s6
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s4
+; SI-NEXT:    s_and_b32 s4, s5, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s76, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x50, v0
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x54, v0
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB41_4:
-; SI-NEXT:    ; implicit-def: $vgpr11
+; SI-NEXT:    ; implicit-def: $sgpr72
+; SI-NEXT:    ; implicit-def: $sgpr94
+; SI-NEXT:    ; implicit-def: $sgpr62
+; SI-NEXT:    ; implicit-def: $sgpr93
+; SI-NEXT:    ; implicit-def: $sgpr60
+; SI-NEXT:    ; implicit-def: $sgpr92
+; SI-NEXT:    ; implicit-def: $sgpr58
+; SI-NEXT:    ; implicit-def: $sgpr91
 ; SI-NEXT:    ; implicit-def: $sgpr56
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $sgpr47
-; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $sgpr90
 ; SI-NEXT:    ; implicit-def: $sgpr46
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $sgpr45
-; SI-NEXT:    ; implicit-def: $vgpr7
+; SI-NEXT:    ; implicit-def: $sgpr89
 ; SI-NEXT:    ; implicit-def: $sgpr44
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $sgpr43
-; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $sgpr88
 ; SI-NEXT:    ; implicit-def: $sgpr42
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $sgpr41
-; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $sgpr79
 ; SI-NEXT:    ; implicit-def: $sgpr40
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $sgpr15
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $sgpr78
 ; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr77
+; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr76
 ; SI-NEXT:    s_branch .LBB41_2
 ;
 ; VI-LABEL: bitcast_v11i64_to_v44i16_scalar:
@@ -26272,131 +26264,131 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a,
 ; SI-NEXT:    v_mov_b32_e32 v10, s29
 ; SI-NEXT:    s_cbranch_scc0 .LBB49_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_alignbit_b32 v23, v8, v7, 16
-; SI-NEXT:    v_alignbit_b32 v24, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v25, v4, v3, 16
-; SI-NEXT:    v_alignbit_b32 v26, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v27, v10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v29, v12, v11, 16
-; SI-NEXT:    v_alignbit_b32 v32, v14, v13, 16
-; SI-NEXT:    v_alignbit_b32 v34, v16, v15, 16
-; SI-NEXT:    v_alignbit_b32 v36, v18, v17, 16
-; SI-NEXT:    v_alignbit_b32 v39, v20, v19, 16
-; SI-NEXT:    v_alignbit_b32 v49, v22, v21, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v20
-; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v22
+; SI-NEXT:    v_lshr_b64 v[23:24], v[7:8], 16
+; SI-NEXT:    v_lshr_b64 v[24:25], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[25:26], v[3:4], 16
+; SI-NEXT:    v_lshr_b64 v[26:27], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[31:32], v[11:12], 16
+; SI-NEXT:    v_lshr_b64 v[27:28], v[9:10], 16
+; SI-NEXT:    v_lshr_b64 v[32:33], v[17:18], 16
+; SI-NEXT:    v_lshr_b64 v[28:29], v[13:14], 16
+; SI-NEXT:    v_lshr_b64 v[33:34], v[19:20], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v22
+; SI-NEXT:    v_lshr_b64 v[29:30], v[15:16], 16
+; SI-NEXT:    v_lshr_b64 v[34:35], v[21:22], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB49_3
 ; SI-NEXT:  .LBB49_2: ; %cmp.true
-; SI-NEXT:    v_add_f64 v[21:22], v[21:22], 1.0
-; SI-NEXT:    v_add_f64 v[19:20], v[19:20], 1.0
-; SI-NEXT:    v_add_f64 v[17:18], v[17:18], 1.0
-; SI-NEXT:    v_add_f64 v[15:16], v[15:16], 1.0
-; SI-NEXT:    v_add_f64 v[13:14], v[13:14], 1.0
-; SI-NEXT:    v_add_f64 v[11:12], v[11:12], 1.0
-; SI-NEXT:    v_add_f64 v[9:10], v[9:10], 1.0
-; SI-NEXT:    v_add_f64 v[1:2], v[1:2], 1.0
-; SI-NEXT:    v_add_f64 v[3:4], v[3:4], 1.0
 ; SI-NEXT:    v_add_f64 v[7:8], v[7:8], 1.0
 ; SI-NEXT:    v_add_f64 v[5:6], v[5:6], 1.0
-; SI-NEXT:    v_alignbit_b32 v23, v8, v7, 16
-; SI-NEXT:    v_alignbit_b32 v24, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v25, v4, v3, 16
-; SI-NEXT:    v_alignbit_b32 v26, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v27, v10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v29, v12, v11, 16
-; SI-NEXT:    v_alignbit_b32 v32, v14, v13, 16
-; SI-NEXT:    v_alignbit_b32 v34, v16, v15, 16
-; SI-NEXT:    v_alignbit_b32 v36, v18, v17, 16
-; SI-NEXT:    v_alignbit_b32 v39, v20, v19, 16
-; SI-NEXT:    v_alignbit_b32 v49, v22, v21, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v28, 16, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v31, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v20
-; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v22
+; SI-NEXT:    v_add_f64 v[3:4], v[3:4], 1.0
+; SI-NEXT:    v_lshr_b64 v[23:24], v[7:8], 16
+; SI-NEXT:    v_add_f64 v[1:2], v[1:2], 1.0
+; SI-NEXT:    v_add_f64 v[11:12], v[11:12], 1.0
+; SI-NEXT:    v_lshr_b64 v[24:25], v[5:6], 16
+; SI-NEXT:    v_add_f64 v[9:10], v[9:10], 1.0
+; SI-NEXT:    v_add_f64 v[17:18], v[17:18], 1.0
+; SI-NEXT:    v_lshr_b64 v[25:26], v[3:4], 16
+; SI-NEXT:    v_add_f64 v[13:14], v[13:14], 1.0
+; SI-NEXT:    v_add_f64 v[19:20], v[19:20], 1.0
+; SI-NEXT:    v_lshr_b64 v[26:27], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[31:32], v[11:12], 16
+; SI-NEXT:    v_add_f64 v[15:16], v[15:16], 1.0
+; SI-NEXT:    v_add_f64 v[21:22], v[21:22], 1.0
+; SI-NEXT:    v_lshr_b64 v[27:28], v[9:10], 16
+; SI-NEXT:    v_lshr_b64 v[32:33], v[17:18], 16
+; SI-NEXT:    v_lshr_b64 v[28:29], v[13:14], 16
+; SI-NEXT:    v_lshr_b64 v[33:34], v[19:20], 16
+; SI-NEXT:    v_lshr_b64 v[29:30], v[15:16], 16
+; SI-NEXT:    v_lshr_b64 v[34:35], v[21:22], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v22
 ; SI-NEXT:  .LBB49_3: ; %end
-; SI-NEXT:    v_lshlrev_b32_e32 v49, 16, v49
+; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v34
 ; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v21
-; SI-NEXT:    v_or_b32_e32 v21, v21, v49
+; SI-NEXT:    v_or_b32_e32 v21, v21, v30
 ; SI-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v22
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v52
+; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v54
 ; SI-NEXT:    v_or_b32_e32 v21, v21, v22
 ; SI-NEXT:    v_add_i32_e32 v22, vcc, 4, v0
 ; SI-NEXT:    buffer_store_dword v21, v22, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v39
+; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v33
+; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v21
 ; SI-NEXT:    v_add_i32_e32 v21, vcc, 8, v0
 ; SI-NEXT:    buffer_store_dword v19, v21, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v51
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v53
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v20
 ; SI-NEXT:    v_add_i32_e32 v20, vcc, 12, v0
 ; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v36
+; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v32
+; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
 ; SI-NEXT:    v_or_b32_e32 v17, v17, v19
 ; SI-NEXT:    v_add_i32_e32 v19, vcc, 16, v0
 ; SI-NEXT:    buffer_store_dword v17, v19, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v18
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v50
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v52
 ; SI-NEXT:    v_or_b32_e32 v17, v17, v18
 ; SI-NEXT:    v_add_i32_e32 v18, vcc, 20, v0
 ; SI-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v34
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v29
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v17
 ; SI-NEXT:    v_add_i32_e32 v17, vcc, 24, v0
 ; SI-NEXT:    buffer_store_dword v15, v17, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v16
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v48
+; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v51
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v16
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 28, v0
 ; SI-NEXT:    buffer_store_dword v15, v16, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v13
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v32
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v28
 ; SI-NEXT:    v_or_b32_e32 v13, v13, v15
 ; SI-NEXT:    v_add_i32_e32 v15, vcc, 32, v0
 ; SI-NEXT:    buffer_store_dword v13, v15, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v14
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v38
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v50
 ; SI-NEXT:    v_or_b32_e32 v13, v13, v14
 ; SI-NEXT:    v_add_i32_e32 v14, vcc, 36, v0
 ; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v29
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v31
 ; SI-NEXT:    v_or_b32_e32 v11, v11, v13
 ; SI-NEXT:    v_add_i32_e32 v13, vcc, 40, v0
 ; SI-NEXT:    buffer_store_dword v11, v13, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v37
+; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v49
 ; SI-NEXT:    v_or_b32_e32 v11, v11, v12
 ; SI-NEXT:    v_add_i32_e32 v12, vcc, 44, v0
 ; SI-NEXT:    buffer_store_dword v11, v12, s[0:3], 0 offen
@@ -26408,7 +26400,7 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v9, v11, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v10
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v35
+; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v48
 ; SI-NEXT:    v_or_b32_e32 v9, v9, v10
 ; SI-NEXT:    v_add_i32_e32 v10, vcc, 52, v0
 ; SI-NEXT:    buffer_store_dword v9, v10, s[0:3], 0 offen
@@ -26420,7 +26412,7 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v9, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v33
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v39
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 60, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -26432,7 +26424,7 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v31
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v38
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x44, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -26444,7 +26436,7 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v30
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v37
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x4c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -26456,35 +26448,35 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v28
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v36
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x54, v0
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB49_4:
-; SI-NEXT:    ; implicit-def: $vgpr49
+; SI-NEXT:    ; implicit-def: $vgpr34
+; SI-NEXT:    ; implicit-def: $vgpr54
+; SI-NEXT:    ; implicit-def: $vgpr33
+; SI-NEXT:    ; implicit-def: $vgpr53
+; SI-NEXT:    ; implicit-def: $vgpr32
 ; SI-NEXT:    ; implicit-def: $vgpr52
-; SI-NEXT:    ; implicit-def: $vgpr39
+; SI-NEXT:    ; implicit-def: $vgpr29
 ; SI-NEXT:    ; implicit-def: $vgpr51
-; SI-NEXT:    ; implicit-def: $vgpr36
+; SI-NEXT:    ; implicit-def: $vgpr28
 ; SI-NEXT:    ; implicit-def: $vgpr50
-; SI-NEXT:    ; implicit-def: $vgpr34
+; SI-NEXT:    ; implicit-def: $vgpr49
 ; SI-NEXT:    ; implicit-def: $vgpr48
-; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr39
 ; SI-NEXT:    ; implicit-def: $vgpr38
-; SI-NEXT:    ; implicit-def: $vgpr29
 ; SI-NEXT:    ; implicit-def: $vgpr37
+; SI-NEXT:    ; implicit-def: $vgpr36
+; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $vgpr35
 ; SI-NEXT:    ; implicit-def: $vgpr26
-; SI-NEXT:    ; implicit-def: $vgpr33
 ; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr31
 ; SI-NEXT:    ; implicit-def: $vgpr24
-; SI-NEXT:    ; implicit-def: $vgpr30
 ; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; implicit-def: $vgpr28
 ; SI-NEXT:    s_branch .LBB49_2
 ;
 ; VI-LABEL: bitcast_v11f64_to_v44i16_scalar:
@@ -35515,368 +35507,413 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i
 ; SI-LABEL: bitcast_v44f16_to_v44i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32
-; SI-NEXT:    v_cvt_f16_f32_e32 v37, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v38, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v39, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v51, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v34, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v35, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v36, v7
-; SI-NEXT:    v_cvt_f16_f32_e32 v50, v8
-; SI-NEXT:    v_cvt_f16_f32_e32 v31, v9
-; SI-NEXT:    v_cvt_f16_f32_e32 v32, v10
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, v11
-; SI-NEXT:    v_cvt_f16_f32_e32 v49, v12
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, v16
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v17
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, v18
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, v19
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, v20
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(6)
+; SI-NEXT:    v_cvt_f16_f32_e32 v57, v2
+; SI-NEXT:    s_waitcnt expcnt(5)
+; SI-NEXT:    v_cvt_f16_f32_e32 v58, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v47, v6
+; SI-NEXT:    v_cvt_f16_f32_e32 v56, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v44, v10
+; SI-NEXT:    v_cvt_f16_f32_e32 v46, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v42, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v45, v15
+; SI-NEXT:    v_cvt_f16_f32_e32 v41, v18
+; SI-NEXT:    v_cvt_f16_f32_e32 v43, v19
+; SI-NEXT:    v_cvt_f16_f32_e32 v55, v22
+; SI-NEXT:    v_cvt_f16_f32_e32 v35, v23
+; SI-NEXT:    v_cvt_f16_f32_e32 v53, v26
+; SI-NEXT:    v_cvt_f16_f32_e32 v33, v27
+; SI-NEXT:    v_cvt_f16_f32_e32 v52, v30
+; SI-NEXT:    v_cvt_f16_f32_e32 v40, s16
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v63, s19
+; SI-NEXT:    v_cvt_f16_f32_e32 v54, s20
+; SI-NEXT:    v_cvt_f16_f32_e32 v61, s23
+; SI-NEXT:    v_cvt_f16_f32_e32 v62, s24
+; SI-NEXT:    v_cvt_f16_f32_e32 v59, s27
+; SI-NEXT:    v_cvt_f16_f32_e32 v60, s28
+; SI-NEXT:    v_cvt_f16_f32_e32 v22, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v36, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v18, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v34, v8
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v32, v12
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v13
+; SI-NEXT:    v_cvt_f16_f32_e32 v30, v16
+; SI-NEXT:    v_cvt_f16_f32_e32 v8, v17
+; SI-NEXT:    v_cvt_f16_f32_e32 v27, v20
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v21
-; SI-NEXT:    v_cvt_f16_f32_e32 v7, v22
-; SI-NEXT:    v_cvt_f16_f32_e32 v8, v23
-; SI-NEXT:    v_cvt_f16_f32_e32 v16, v24
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v25
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v26
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v27
-; SI-NEXT:    v_cvt_f16_f32_e32 v12, v28
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v29
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v30
-; SI-NEXT:    v_cvt_f16_f32_e32 v27, s16
-; SI-NEXT:    v_cvt_f16_f32_e32 v52, s17
-; SI-NEXT:    v_cvt_f16_f32_e32 v24, s18
-; SI-NEXT:    v_cvt_f16_f32_e32 v25, s19
-; SI-NEXT:    v_cvt_f16_f32_e32 v26, s20
-; SI-NEXT:    v_cvt_f16_f32_e32 v30, s21
-; SI-NEXT:    v_cvt_f16_f32_e32 v21, s22
-; SI-NEXT:    v_cvt_f16_f32_e32 v22, s23
-; SI-NEXT:    v_cvt_f16_f32_e32 v23, s24
-; SI-NEXT:    v_cvt_f16_f32_e32 v29, s25
-; SI-NEXT:    v_cvt_f16_f32_e32 v18, s26
-; SI-NEXT:    v_cvt_f16_f32_e32 v19, s27
-; SI-NEXT:    v_cvt_f16_f32_e32 v20, s28
-; SI-NEXT:    v_cvt_f16_f32_e32 v28, s29
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v53
+; SI-NEXT:    v_cvt_f16_f32_e32 v19, v24
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v25
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, v28
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v29
+; SI-NEXT:    v_cvt_f16_f32_e32 v50, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v26, s18
+; SI-NEXT:    v_cvt_f16_f32_e32 v48, s21
+; SI-NEXT:    v_cvt_f16_f32_e32 v24, s22
+; SI-NEXT:    v_cvt_f16_f32_e32 v38, s25
+; SI-NEXT:    v_cvt_f16_f32_e32 v16, s26
+; SI-NEXT:    v_cvt_f16_f32_e32 v29, s29
+; SI-NEXT:    s_waitcnt vmcnt(14)
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_cbranch_scc0 .LBB59_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_cbranch_execnz .LBB59_3
 ; SI-NEXT:  .LBB59_2: ; %cmp.true
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v63
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v26
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v50
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v63, v5
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; SI-NEXT:    v_cvt_f32_f16_e32 v32, v32
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v63
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT:    v_or_b32_e32 v26, v3, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v61
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v24
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v48
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v61, v5
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v35, v35
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
-; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
-; SI-NEXT:    v_cvt_f32_f16_e32 v34, v34
-; SI-NEXT:    v_cvt_f32_f16_e32 v38, v38
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
-; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v37
-; SI-NEXT:    v_cvt_f32_f16_e32 v19, v19
-; SI-NEXT:    v_lshlrev_b32_e32 v53, 16, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
-; SI-NEXT:    v_add_f32_e32 v32, 0x38000000, v32
-; SI-NEXT:    v_cvt_f32_f16_e32 v18, v18
-; SI-NEXT:    v_cvt_f32_f16_e32 v22, v22
-; SI-NEXT:    v_or_b32_e32 v1, v1, v53
-; SI-NEXT:    v_lshlrev_b32_e32 v53, 16, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; SI-NEXT:    v_cvt_f16_f32_e32 v32, v32
-; SI-NEXT:    v_add_f32_e32 v31, 0x38000000, v31
-; SI-NEXT:    v_add_f32_e32 v35, 0x38000000, v35
-; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT:    v_cvt_f32_f16_e32 v25, v25
-; SI-NEXT:    v_or_b32_e32 v3, v3, v53
-; SI-NEXT:    v_lshlrev_b32_e32 v53, 16, v7
-; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
-; SI-NEXT:    v_cvt_f16_f32_e32 v35, v35
-; SI-NEXT:    v_add_f32_e32 v34, 0x38000000, v34
-; SI-NEXT:    v_add_f32_e32 v38, 0x38000000, v38
-; SI-NEXT:    v_cvt_f32_f16_e32 v24, v24
-; SI-NEXT:    v_cvt_f32_f16_e32 v52, v52
-; SI-NEXT:    v_cvt_f32_f16_e32 v30, v30
-; SI-NEXT:    v_cvt_f32_f16_e32 v29, v29
-; SI-NEXT:    v_cvt_f32_f16_e32 v28, v28
-; SI-NEXT:    v_cvt_f32_f16_e32 v51, v51
-; SI-NEXT:    v_cvt_f32_f16_e32 v50, v50
-; SI-NEXT:    v_cvt_f32_f16_e32 v49, v49
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, v48
-; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
-; SI-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; SI-NEXT:    v_or_b32_e32 v6, v6, v53
-; SI-NEXT:    v_lshlrev_b32_e32 v53, 16, v10
-; SI-NEXT:    v_cvt_f16_f32_e32 v34, v34
-; SI-NEXT:    v_cvt_f16_f32_e32 v38, v38
-; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v37
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v61
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_or_b32_e32 v24, v3, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v59
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v16
+; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v38
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v59, v5
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v59
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_or_b32_e32 v16, v3, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v57
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v22
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v29
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v57, v5
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v57
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_or_b32_e32 v22, v3, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v47
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v18
+; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v36
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v47, v5
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v47
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_or_b32_e32 v18, v3, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v44
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v34
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v44, v5
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v44
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_or_b32_e32 v14, v3, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v42
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v10
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v32
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v42, v5
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v42
+; SI-NEXT:    v_or_b32_e32 v10, v3, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v41
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v8
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v30
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v41, v5
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v41
+; SI-NEXT:    v_or_b32_e32 v8, v3, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v55
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v27
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v55, v5
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v55
+; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
+; SI-NEXT:    v_or_b32_e32 v6, v3, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v53
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v19
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v53, v4
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v53
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_or_b32_e32 v4, v3, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v11
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, v52
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v54
+; SI-NEXT:    v_cvt_f32_f16_e32 v19, v60
+; SI-NEXT:    v_cvt_f32_f16_e32 v20, v33
+; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v52, v11
+; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
 ; SI-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
-; SI-NEXT:    v_cvt_f32_f16_e32 v27, v27
-; SI-NEXT:    v_cvt_f32_f16_e32 v26, v26
-; SI-NEXT:    v_cvt_f32_f16_e32 v23, v23
-; SI-NEXT:    v_cvt_f32_f16_e32 v20, v20
-; SI-NEXT:    v_cvt_f32_f16_e32 v39, v39
-; SI-NEXT:    v_cvt_f32_f16_e32 v36, v36
-; SI-NEXT:    v_cvt_f32_f16_e32 v33, v33
-; SI-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT:    v_or_b32_e32 v9, v9, v53
-; SI-NEXT:    v_lshlrev_b32_e32 v53, 16, v14
-; SI-NEXT:    v_cvt_f16_f32_e32 v37, v37
+; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v52
+; SI-NEXT:    v_or_b32_e32 v2, v2, v11
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, v40
 ; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
-; SI-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
-; SI-NEXT:    v_add_f32_e32 v22, 0x38000000, v22
-; SI-NEXT:    v_or_b32_e32 v13, v13, v53
-; SI-NEXT:    v_lshlrev_b32_e32 v53, 16, v32
-; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
-; SI-NEXT:    v_add_f32_e32 v21, 0x38000000, v21
-; SI-NEXT:    v_add_f32_e32 v25, 0x38000000, v25
-; SI-NEXT:    v_or_b32_e32 v31, v31, v53
-; SI-NEXT:    v_lshlrev_b32_e32 v53, 16, v35
-; SI-NEXT:    v_cvt_f16_f32_e32 v21, v21
-; SI-NEXT:    v_cvt_f16_f32_e32 v25, v25
-; SI-NEXT:    v_add_f32_e32 v24, 0x38000000, v24
-; SI-NEXT:    v_add_f32_e32 v52, 0x38000000, v52
-; SI-NEXT:    v_add_f32_e32 v30, 0x38000000, v30
-; SI-NEXT:    v_add_f32_e32 v29, 0x38000000, v29
-; SI-NEXT:    v_add_f32_e32 v28, 0x38000000, v28
-; SI-NEXT:    v_add_f32_e32 v51, 0x38000000, v51
-; SI-NEXT:    v_add_f32_e32 v50, 0x38000000, v50
-; SI-NEXT:    v_add_f32_e32 v49, 0x38000000, v49
-; SI-NEXT:    v_add_f32_e32 v48, 0x38000000, v48
-; SI-NEXT:    v_add_f32_e32 v17, 0x38000000, v17
-; SI-NEXT:    v_add_f32_e32 v16, 0x38000000, v16
+; SI-NEXT:    v_or_b32_e32 v54, v12, v23
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v58
+; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; SI-NEXT:    v_or_b32_e32 v60, v19, v21
+; SI-NEXT:    v_cvt_f32_f16_e32 v19, v46
 ; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
-; SI-NEXT:    v_or_b32_e32 v34, v34, v53
-; SI-NEXT:    v_lshlrev_b32_e32 v53, 16, v38
-; SI-NEXT:    v_cvt_f16_f32_e32 v24, v24
-; SI-NEXT:    v_cvt_f16_f32_e32 v52, v52
-; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v27
-; SI-NEXT:    v_cvt_f16_f32_e32 v30, v30
-; SI-NEXT:    v_add_f32_e32 v26, 0x38000000, v26
-; SI-NEXT:    v_cvt_f16_f32_e32 v29, v29
-; SI-NEXT:    v_add_f32_e32 v23, 0x38000000, v23
-; SI-NEXT:    v_cvt_f16_f32_e32 v28, v28
-; SI-NEXT:    v_add_f32_e32 v20, 0x38000000, v20
-; SI-NEXT:    v_cvt_f16_f32_e32 v51, v51
-; SI-NEXT:    v_add_f32_e32 v39, 0x38000000, v39
-; SI-NEXT:    v_cvt_f16_f32_e32 v50, v50
-; SI-NEXT:    v_add_f32_e32 v36, 0x38000000, v36
-; SI-NEXT:    v_cvt_f16_f32_e32 v49, v49
-; SI-NEXT:    v_add_f32_e32 v33, 0x38000000, v33
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, v48
-; SI-NEXT:    v_add_f32_e32 v15, 0x38000000, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
+; SI-NEXT:    v_or_b32_e32 v40, v11, v25
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, v62
+; SI-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
+; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
 ; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
-; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; SI-NEXT:    v_or_b32_e32 v58, v12, v17
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v45
+; SI-NEXT:    v_or_b32_e32 v46, v19, v9
+; SI-NEXT:    v_or_b32_e32 v62, v11, v15
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, v56
+; SI-NEXT:    v_cvt_f32_f16_e32 v19, v35
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
+; SI-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
+; SI-NEXT:    v_add_f32_e32 v20, 0x38000000, v20
+; SI-NEXT:    v_or_b32_e32 v56, v11, v13
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, v43
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
-; SI-NEXT:    v_or_b32_e32 v37, v37, v53
-; SI-NEXT:    v_lshlrev_b32_e32 v53, 16, v19
-; SI-NEXT:    v_cvt_f16_f32_e32 v27, v27
-; SI-NEXT:    v_cvt_f16_f32_e32 v26, v26
-; SI-NEXT:    v_cvt_f16_f32_e32 v23, v23
+; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
-; SI-NEXT:    v_cvt_f16_f32_e32 v39, v39
-; SI-NEXT:    v_cvt_f16_f32_e32 v36, v36
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, v33
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
+; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
 ; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; SI-NEXT:    v_or_b32_e32 v18, v18, v53
-; SI-NEXT:    v_lshlrev_b32_e32 v53, 16, v22
-; SI-NEXT:    v_or_b32_e32 v21, v21, v53
-; SI-NEXT:    v_lshlrev_b32_e32 v53, 16, v25
-; SI-NEXT:    v_or_b32_e32 v24, v24, v53
-; SI-NEXT:    v_lshlrev_b32_e32 v52, 16, v52
-; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
-; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
-; SI-NEXT:    v_lshlrev_b32_e32 v51, 16, v51
-; SI-NEXT:    v_lshlrev_b32_e32 v50, 16, v50
-; SI-NEXT:    v_lshlrev_b32_e32 v49, 16, v49
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v48
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; SI-NEXT:    v_or_b32_e32 v27, v27, v52
-; SI-NEXT:    v_or_b32_e32 v26, v26, v30
-; SI-NEXT:    v_or_b32_e32 v23, v23, v29
-; SI-NEXT:    v_or_b32_e32 v20, v20, v28
-; SI-NEXT:    v_or_b32_e32 v39, v39, v51
-; SI-NEXT:    v_or_b32_e32 v36, v36, v50
-; SI-NEXT:    v_or_b32_e32 v33, v33, v49
-; SI-NEXT:    v_or_b32_e32 v15, v15, v48
-; SI-NEXT:    v_or_b32_e32 v11, v11, v17
-; SI-NEXT:    v_or_b32_e32 v8, v8, v16
-; SI-NEXT:    v_or_b32_e32 v5, v5, v12
-; SI-NEXT:    v_alignbit_b32 v52, v24, v52, 16
-; SI-NEXT:    v_alignbit_b32 v30, v21, v30, 16
-; SI-NEXT:    v_alignbit_b32 v29, v18, v29, 16
-; SI-NEXT:    v_alignbit_b32 v28, v37, v28, 16
-; SI-NEXT:    v_alignbit_b32 v51, v34, v51, 16
-; SI-NEXT:    v_alignbit_b32 v50, v31, v50, 16
-; SI-NEXT:    v_alignbit_b32 v49, v13, v49, 16
-; SI-NEXT:    v_alignbit_b32 v48, v9, v48, 16
-; SI-NEXT:    v_alignbit_b32 v17, v6, v17, 16
-; SI-NEXT:    v_alignbit_b32 v16, v3, v16, 16
-; SI-NEXT:    v_alignbit_b32 v12, v1, v12, 16
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_or_b32_e32 v45, v12, v7
+; SI-NEXT:    v_or_b32_e32 v12, v19, v3
+; SI-NEXT:    v_or_b32_e32 v43, v11, v5
+; SI-NEXT:    v_or_b32_e32 v11, v20, v1
+; SI-NEXT:    v_lshr_b64 v[29:30], v[21:22], 16
+; SI-NEXT:    v_lshr_b64 v[34:35], v[13:14], 16
+; SI-NEXT:    v_lshr_b64 v[32:33], v[9:10], 16
+; SI-NEXT:    v_lshr_b64 v[50:51], v[25:26], 16
+; SI-NEXT:    v_lshr_b64 v[48:49], v[23:24], 16
+; SI-NEXT:    v_lshr_b64 v[38:39], v[15:16], 16
+; SI-NEXT:    v_lshr_b64 v[36:37], v[17:18], 16
+; SI-NEXT:    v_mov_b32_e32 v35, v12
+; SI-NEXT:    v_mov_b32_e32 v33, v11
+; SI-NEXT:    v_lshr_b64 v[30:31], v[7:8], 16
+; SI-NEXT:    v_lshr_b64 v[27:28], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[19:20], v[3:4], 16
+; SI-NEXT:    v_lshr_b64 v[11:12], v[1:2], 16
 ; SI-NEXT:  .LBB59_3: ; %end
-; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v27
-; SI-NEXT:    v_lshlrev_b32_e32 v52, 16, v52
-; SI-NEXT:    v_and_b32_e32 v24, 0xffff, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; SI-NEXT:    v_or_b32_e32 v27, v27, v52
-; SI-NEXT:    v_or_b32_e32 v24, v24, v25
-; SI-NEXT:    v_add_i32_e32 v25, vcc, 4, v0
-; SI-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v24, v25, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v24, 0xffff, v26
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v30
-; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v21
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; SI-NEXT:    v_or_b32_e32 v24, v24, v25
-; SI-NEXT:    v_add_i32_e32 v25, vcc, 8, v0
-; SI-NEXT:    v_or_b32_e32 v21, v21, v22
-; SI-NEXT:    v_add_i32_e32 v22, vcc, 12, v0
-; SI-NEXT:    buffer_store_dword v24, v25, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v21, v22, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v50
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v40
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v23
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v29
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v18
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; SI-NEXT:    v_or_b32_e32 v21, v21, v22
-; SI-NEXT:    v_add_i32_e32 v22, vcc, 16, v0
-; SI-NEXT:    v_or_b32_e32 v18, v18, v19
-; SI-NEXT:    v_add_i32_e32 v19, vcc, 20, v0
-; SI-NEXT:    buffer_store_dword v21, v22, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v18, v19, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v28
-; SI-NEXT:    v_or_b32_e32 v18, v18, v19
-; SI-NEXT:    v_add_i32_e32 v19, vcc, 24, v0
-; SI-NEXT:    buffer_store_dword v18, v19, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v37
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v38
-; SI-NEXT:    v_or_b32_e32 v18, v18, v19
-; SI-NEXT:    v_add_i32_e32 v19, vcc, 28, v0
-; SI-NEXT:    buffer_store_dword v18, v19, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v39
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v51
-; SI-NEXT:    v_or_b32_e32 v18, v18, v19
-; SI-NEXT:    v_add_i32_e32 v19, vcc, 32, v0
-; SI-NEXT:    buffer_store_dword v18, v19, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v34
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v35
-; SI-NEXT:    v_or_b32_e32 v18, v18, v19
-; SI-NEXT:    v_add_i32_e32 v19, vcc, 36, v0
-; SI-NEXT:    buffer_store_dword v18, v19, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v36
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v50
-; SI-NEXT:    v_or_b32_e32 v18, v18, v19
-; SI-NEXT:    v_add_i32_e32 v19, vcc, 40, v0
-; SI-NEXT:    buffer_store_dword v18, v19, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v31
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v32
-; SI-NEXT:    v_or_b32_e32 v18, v18, v19
-; SI-NEXT:    v_add_i32_e32 v19, vcc, 44, v0
-; SI-NEXT:    buffer_store_dword v18, v19, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v26
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v63
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 4, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v33
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v49
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v13
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT:    v_or_b32_e32 v18, v18, v19
-; SI-NEXT:    v_add_i32_e32 v19, vcc, 48, v0
-; SI-NEXT:    v_or_b32_e32 v13, v13, v14
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 52, v0
-; SI-NEXT:    buffer_store_dword v18, v19, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v48
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v54
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 8, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v15
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v48
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT:    v_or_b32_e32 v13, v13, v14
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 56, v0
-; SI-NEXT:    v_or_b32_e32 v9, v9, v10
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 60, v0
-; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v9, v10, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v24
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v61
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 12, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v17
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    v_or_b32_e32 v9, v9, v10
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 64, v0
-; SI-NEXT:    v_or_b32_e32 v6, v6, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x44, v0
-; SI-NEXT:    buffer_store_dword v9, v10, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v38
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v62
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v16
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT:    v_or_b32_e32 v6, v6, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x48, v0
-; SI-NEXT:    v_or_b32_e32 v3, v3, v4
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x4c, v0
-; SI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v16
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v59
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 20, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v12
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v3, v3, v4
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x50, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v29
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v60
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 24, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v22
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v57
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 28, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v58
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v36
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 32, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v18
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v47
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 36, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v56
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v34
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 40, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v44
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 44, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v46
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v32
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 48, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v10
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v42
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 52, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v45
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v30
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 56, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v8
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v41
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 60, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v43
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v27
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 64, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v55
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x44, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v35
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v19
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x48, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v53
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x4c, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v33
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v11
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x50, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v52
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x54, v0
-; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB59_4:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
index d7faaa68732d2..c35e183fa787f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
@@ -3189,289 +3189,301 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3
 ; SI-LABEL: bitcast_v24i32_to_v48i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_writelane_b32 v12, s30, 0
+; SI-NEXT:    v_writelane_b32 v12, s31, 1
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
-; SI-NEXT:    v_readfirstlane_b32 s15, v1
-; SI-NEXT:    v_readfirstlane_b32 s14, v2
-; SI-NEXT:    v_readfirstlane_b32 s13, v3
-; SI-NEXT:    v_readfirstlane_b32 s12, v4
-; SI-NEXT:    v_readfirstlane_b32 s11, v5
-; SI-NEXT:    v_readfirstlane_b32 s10, v6
-; SI-NEXT:    v_readfirstlane_b32 s9, v7
-; SI-NEXT:    v_readfirstlane_b32 s8, v8
-; SI-NEXT:    v_readfirstlane_b32 s7, v9
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_readfirstlane_b32 s6, v10
+; SI-NEXT:    v_writelane_b32 v12, s34, 2
+; SI-NEXT:    v_readfirstlane_b32 s12, v1
+; SI-NEXT:    v_readfirstlane_b32 s13, v2
+; SI-NEXT:    v_readfirstlane_b32 s10, v3
+; SI-NEXT:    v_readfirstlane_b32 s11, v4
+; SI-NEXT:    v_readfirstlane_b32 s8, v5
+; SI-NEXT:    v_readfirstlane_b32 s9, v6
+; SI-NEXT:    v_readfirstlane_b32 s6, v7
+; SI-NEXT:    v_readfirstlane_b32 s7, v8
+; SI-NEXT:    v_readfirstlane_b32 s4, v9
+; SI-NEXT:    s_and_b64 s[14:15], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s5, v10
+; SI-NEXT:    v_writelane_b32 v12, s35, 3
 ; SI-NEXT:    s_cbranch_scc0 .LBB13_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s9
-; SI-NEXT:    v_mov_b32_e32 v3, s11
-; SI-NEXT:    v_mov_b32_e32 v4, s13
-; SI-NEXT:    v_mov_b32_e32 v5, s15
-; SI-NEXT:    v_mov_b32_e32 v6, s28
-; SI-NEXT:    v_mov_b32_e32 v7, s26
-; SI-NEXT:    v_mov_b32_e32 v8, s24
-; SI-NEXT:    v_mov_b32_e32 v9, s22
-; SI-NEXT:    v_mov_b32_e32 v10, s20
-; SI-NEXT:    v_mov_b32_e32 v11, s18
-; SI-NEXT:    v_mov_b32_e32 v12, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v2, s8, v2, 16
-; SI-NEXT:    v_alignbit_b32 v3, s10, v3, 16
-; SI-NEXT:    v_alignbit_b32 v4, s12, v4, 16
-; SI-NEXT:    v_alignbit_b32 v5, s14, v5, 16
-; SI-NEXT:    v_alignbit_b32 v6, s29, v6, 16
-; SI-NEXT:    v_alignbit_b32 v7, s27, v7, 16
-; SI-NEXT:    v_alignbit_b32 v8, s25, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, s23, v9, 16
-; SI-NEXT:    v_alignbit_b32 v10, s21, v10, 16
-; SI-NEXT:    v_alignbit_b32 v11, s19, v11, 16
-; SI-NEXT:    v_alignbit_b32 v12, s17, v12, 16
-; SI-NEXT:    s_lshr_b32 s40, s6, 16
-; SI-NEXT:    s_lshr_b32 s41, s8, 16
-; SI-NEXT:    s_lshr_b32 s42, s10, 16
-; SI-NEXT:    s_lshr_b32 s43, s12, 16
-; SI-NEXT:    s_lshr_b32 s44, s14, 16
-; SI-NEXT:    s_lshr_b32 s45, s29, 16
-; SI-NEXT:    s_lshr_b32 s46, s27, 16
-; SI-NEXT:    s_lshr_b32 s47, s25, 16
-; SI-NEXT:    s_lshr_b32 s56, s23, 16
-; SI-NEXT:    s_lshr_b32 s57, s21, 16
-; SI-NEXT:    s_lshr_b32 s58, s19, 16
-; SI-NEXT:    s_lshr_b32 s59, s17, 16
+; SI-NEXT:    s_lshr_b32 s88, s5, 16
+; SI-NEXT:    s_lshr_b32 s89, s7, 16
+; SI-NEXT:    s_lshr_b32 s90, s9, 16
+; SI-NEXT:    s_lshr_b32 s91, s11, 16
+; SI-NEXT:    s_lshr_b32 s92, s13, 16
+; SI-NEXT:    s_lshr_b32 s93, s29, 16
+; SI-NEXT:    s_lshr_b32 s94, s27, 16
+; SI-NEXT:    s_lshr_b32 s95, s25, 16
+; SI-NEXT:    s_lshr_b32 s30, s23, 16
+; SI-NEXT:    s_lshr_b32 s31, s21, 16
+; SI-NEXT:    s_lshr_b32 s34, s19, 16
+; SI-NEXT:    s_lshr_b32 s35, s17, 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[42:43], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[10:11], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[12:13], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[58:59], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[62:63], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[72:73], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[74:75], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[76:77], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB13_3
 ; SI-NEXT:  .LBB13_2: ; %cmp.true
+; SI-NEXT:    s_add_i32 s17, s17, 3
 ; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    s_add_i32 s19, s19, 3
 ; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_add_i32 s21, s21, 3
 ; SI-NEXT:    s_add_i32 s20, s20, 3
+; SI-NEXT:    s_add_i32 s23, s23, 3
 ; SI-NEXT:    s_add_i32 s22, s22, 3
+; SI-NEXT:    s_add_i32 s25, s25, 3
 ; SI-NEXT:    s_add_i32 s24, s24, 3
+; SI-NEXT:    s_add_i32 s27, s27, 3
 ; SI-NEXT:    s_add_i32 s26, s26, 3
+; SI-NEXT:    s_add_i32 s29, s29, 3
 ; SI-NEXT:    s_add_i32 s28, s28, 3
-; SI-NEXT:    s_add_i32 s15, s15, 3
 ; SI-NEXT:    s_add_i32 s13, s13, 3
-; SI-NEXT:    s_add_i32 s11, s11, 3
-; SI-NEXT:    s_add_i32 s9, s9, 3
-; SI-NEXT:    s_add_i32 s7, s7, 3
-; SI-NEXT:    s_add_i32 s17, s17, 3
-; SI-NEXT:    s_add_i32 s19, s19, 3
-; SI-NEXT:    s_add_i32 s21, s21, 3
-; SI-NEXT:    s_add_i32 s23, s23, 3
-; SI-NEXT:    s_add_i32 s25, s25, 3
-; SI-NEXT:    s_add_i32 s27, s27, 3
-; SI-NEXT:    s_add_i32 s29, s29, 3
-; SI-NEXT:    s_add_i32 s14, s14, 3
 ; SI-NEXT:    s_add_i32 s12, s12, 3
+; SI-NEXT:    s_add_i32 s11, s11, 3
 ; SI-NEXT:    s_add_i32 s10, s10, 3
+; SI-NEXT:    s_add_i32 s9, s9, 3
 ; SI-NEXT:    s_add_i32 s8, s8, 3
+; SI-NEXT:    s_add_i32 s7, s7, 3
 ; SI-NEXT:    s_add_i32 s6, s6, 3
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s9
-; SI-NEXT:    v_mov_b32_e32 v3, s11
-; SI-NEXT:    v_mov_b32_e32 v4, s13
-; SI-NEXT:    v_mov_b32_e32 v5, s15
-; SI-NEXT:    v_mov_b32_e32 v6, s28
-; SI-NEXT:    v_mov_b32_e32 v7, s26
-; SI-NEXT:    v_mov_b32_e32 v8, s24
-; SI-NEXT:    v_mov_b32_e32 v9, s22
-; SI-NEXT:    v_mov_b32_e32 v10, s20
-; SI-NEXT:    v_mov_b32_e32 v11, s18
-; SI-NEXT:    v_mov_b32_e32 v12, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v2, s8, v2, 16
-; SI-NEXT:    v_alignbit_b32 v3, s10, v3, 16
-; SI-NEXT:    v_alignbit_b32 v4, s12, v4, 16
-; SI-NEXT:    v_alignbit_b32 v5, s14, v5, 16
-; SI-NEXT:    v_alignbit_b32 v6, s29, v6, 16
-; SI-NEXT:    v_alignbit_b32 v7, s27, v7, 16
-; SI-NEXT:    v_alignbit_b32 v8, s25, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, s23, v9, 16
-; SI-NEXT:    v_alignbit_b32 v10, s21, v10, 16
-; SI-NEXT:    v_alignbit_b32 v11, s19, v11, 16
-; SI-NEXT:    v_alignbit_b32 v12, s17, v12, 16
-; SI-NEXT:    s_lshr_b32 s40, s6, 16
-; SI-NEXT:    s_lshr_b32 s41, s8, 16
-; SI-NEXT:    s_lshr_b32 s42, s10, 16
-; SI-NEXT:    s_lshr_b32 s43, s12, 16
-; SI-NEXT:    s_lshr_b32 s44, s14, 16
-; SI-NEXT:    s_lshr_b32 s45, s29, 16
-; SI-NEXT:    s_lshr_b32 s46, s27, 16
-; SI-NEXT:    s_lshr_b32 s47, s25, 16
-; SI-NEXT:    s_lshr_b32 s56, s23, 16
-; SI-NEXT:    s_lshr_b32 s57, s21, 16
-; SI-NEXT:    s_lshr_b32 s58, s19, 16
-; SI-NEXT:    s_lshr_b32 s59, s17, 16
+; SI-NEXT:    s_add_i32 s5, s5, 3
+; SI-NEXT:    s_add_i32 s4, s4, 3
+; SI-NEXT:    s_lshr_b64 s[14:15], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[42:43], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[10:11], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[12:13], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[58:59], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[24:25], 16
+; SI-NEXT:    s_lshr_b32 s88, s5, 16
+; SI-NEXT:    s_lshr_b32 s89, s7, 16
+; SI-NEXT:    s_lshr_b32 s90, s9, 16
+; SI-NEXT:    s_lshr_b32 s91, s11, 16
+; SI-NEXT:    s_lshr_b32 s92, s13, 16
+; SI-NEXT:    s_lshr_b32 s93, s29, 16
+; SI-NEXT:    s_lshr_b32 s94, s27, 16
+; SI-NEXT:    s_lshr_b32 s95, s25, 16
+; SI-NEXT:    s_lshr_b32 s30, s23, 16
+; SI-NEXT:    s_lshr_b32 s31, s21, 16
+; SI-NEXT:    s_lshr_b32 s34, s19, 16
+; SI-NEXT:    s_lshr_b32 s35, s17, 16
+; SI-NEXT:    s_lshr_b64 s[62:63], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[72:73], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[74:75], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[76:77], s[16:17], 16
 ; SI-NEXT:  .LBB13_3: ; %end
-; SI-NEXT:    s_and_b32 s4, s16, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; SI-NEXT:    v_or_b32_e32 v12, s4, v12
-; SI-NEXT:    s_and_b32 s4, s17, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s59, 16
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_mov_b32_e32 v13, s4
-; SI-NEXT:    s_and_b32 s4, s18, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v12, vcc, 4, v0
-; SI-NEXT:    v_or_b32_e32 v11, s4, v11
-; SI-NEXT:    s_and_b32 s4, s19, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s58, 16
-; SI-NEXT:    buffer_store_dword v13, v12, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v12, vcc, 8, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v11, v12, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v12, s4
-; SI-NEXT:    s_and_b32 s4, s20, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    s_lshl_b32 s15, s76, 16
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s15, s16, s15
+; SI-NEXT:    v_mov_b32_e32 v1, s15
+; SI-NEXT:    s_and_b32 s15, s17, 0xffff
+; SI-NEXT:    s_lshl_b32 s16, s35, 16
+; SI-NEXT:    s_or_b32 s15, s15, s16
+; SI-NEXT:    v_mov_b32_e32 v2, s15
+; SI-NEXT:    s_lshl_b32 s15, s74, 16
+; SI-NEXT:    s_and_b32 s16, s18, 0xffff
+; SI-NEXT:    s_or_b32 s15, s16, s15
+; SI-NEXT:    v_mov_b32_e32 v3, s15
+; SI-NEXT:    s_and_b32 s15, s19, 0xffff
+; SI-NEXT:    s_lshl_b32 s16, s34, 16
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 12, v0
-; SI-NEXT:    v_or_b32_e32 v10, s4, v10
-; SI-NEXT:    s_and_b32 s4, s21, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s57, 16
-; SI-NEXT:    buffer_store_dword v12, v11, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 16, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v10, v11, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v11, s4
-; SI-NEXT:    s_and_b32 s4, s22, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
+; SI-NEXT:    s_or_b32 s15, s15, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s15
+; SI-NEXT:    s_and_b32 s15, s20, 0xffff
+; SI-NEXT:    s_lshl_b32 s16, s72, 16
+; SI-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 12, v0
+; SI-NEXT:    s_or_b32 s15, s15, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 20, v0
-; SI-NEXT:    v_or_b32_e32 v9, s4, v9
-; SI-NEXT:    s_and_b32 s4, s23, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s56, 16
-; SI-NEXT:    buffer_store_dword v11, v10, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 24, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v9, v10, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v10, s4
-; SI-NEXT:    s_and_b32 s4, s24, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; SI-NEXT:    v_mov_b32_e32 v2, s15
+; SI-NEXT:    s_and_b32 s15, s21, 0xffff
+; SI-NEXT:    s_lshl_b32 s16, s31, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 16, v0
+; SI-NEXT:    s_or_b32 s15, s15, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 28, v0
-; SI-NEXT:    v_or_b32_e32 v8, s4, v8
-; SI-NEXT:    s_and_b32 s4, s25, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s47, 16
-; SI-NEXT:    buffer_store_dword v10, v9, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 32, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v8, v9, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v9, s4
-; SI-NEXT:    s_and_b32 s4, s26, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT:    v_mov_b32_e32 v2, s15
+; SI-NEXT:    s_and_b32 s15, s22, 0xffff
+; SI-NEXT:    s_lshl_b32 s16, s62, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 20, v0
+; SI-NEXT:    s_or_b32 s15, s15, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 36, v0
-; SI-NEXT:    v_or_b32_e32 v7, s4, v7
-; SI-NEXT:    s_and_b32 s4, s27, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s46, 16
-; SI-NEXT:    buffer_store_dword v9, v8, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 40, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v8, s4
-; SI-NEXT:    s_and_b32 s4, s28, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT:    v_mov_b32_e32 v2, s15
+; SI-NEXT:    s_and_b32 s15, s23, 0xffff
+; SI-NEXT:    s_lshl_b32 s16, s30, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 24, v0
+; SI-NEXT:    s_or_b32 s15, s15, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 44, v0
-; SI-NEXT:    v_or_b32_e32 v6, s4, v6
-; SI-NEXT:    s_and_b32 s4, s29, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s45, 16
-; SI-NEXT:    buffer_store_dword v8, v7, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v7, s4
-; SI-NEXT:    s_and_b32 s4, s15, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_mov_b32_e32 v2, s15
+; SI-NEXT:    s_and_b32 s15, s24, 0xffff
+; SI-NEXT:    s_lshl_b32 s16, s60, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 28, v0
+; SI-NEXT:    s_or_b32 s15, s15, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 52, v0
-; SI-NEXT:    v_or_b32_e32 v5, s4, v5
-; SI-NEXT:    s_and_b32 s4, s14, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s44, 16
-; SI-NEXT:    buffer_store_dword v7, v6, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 56, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v6, s4
-; SI-NEXT:    s_and_b32 s4, s13, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_mov_b32_e32 v2, s15
+; SI-NEXT:    s_and_b32 s15, s25, 0xffff
+; SI-NEXT:    s_lshl_b32 s16, s95, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 32, v0
+; SI-NEXT:    s_or_b32 s15, s15, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 60, v0
-; SI-NEXT:    v_or_b32_e32 v4, s4, v4
-; SI-NEXT:    s_and_b32 s4, s12, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s43, 16
-; SI-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 64, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v5, s4
-; SI-NEXT:    s_and_b32 s4, s11, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_mov_b32_e32 v2, s15
+; SI-NEXT:    s_and_b32 s15, s26, 0xffff
+; SI-NEXT:    s_lshl_b32 s16, s58, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 36, v0
+; SI-NEXT:    s_or_b32 s15, s15, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x44, v0
-; SI-NEXT:    v_or_b32_e32 v3, s4, v3
-; SI-NEXT:    s_and_b32 s4, s10, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s42, 16
-; SI-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v4, s4
-; SI-NEXT:    s_and_b32 s4, s9, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_mov_b32_e32 v2, s15
+; SI-NEXT:    s_and_b32 s15, s27, 0xffff
+; SI-NEXT:    s_lshl_b32 s16, s94, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 40, v0
+; SI-NEXT:    s_or_b32 s15, s15, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x4c, v0
-; SI-NEXT:    v_or_b32_e32 v2, s4, v2
-; SI-NEXT:    s_and_b32 s4, s8, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s41, 16
-; SI-NEXT:    buffer_store_dword v4, v3, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x50, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v3, s4
-; SI-NEXT:    s_and_b32 s4, s7, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_mov_b32_e32 v2, s15
+; SI-NEXT:    s_and_b32 s15, s28, 0xffff
+; SI-NEXT:    s_lshl_b32 s16, s56, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 44, v0
+; SI-NEXT:    s_or_b32 s15, s15, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x54, v0
-; SI-NEXT:    v_or_b32_e32 v1, s4, v1
-; SI-NEXT:    s_and_b32 s4, s6, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s40, 16
-; SI-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x58, v0
+; SI-NEXT:    v_mov_b32_e32 v2, s15
+; SI-NEXT:    s_and_b32 s15, s29, 0xffff
+; SI-NEXT:    s_lshl_b32 s16, s93, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 48, v0
+; SI-NEXT:    s_or_b32 s15, s15, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s15
+; SI-NEXT:    s_and_b32 s12, s12, 0xffff
+; SI-NEXT:    s_lshl_b32 s15, s46, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 52, v0
+; SI-NEXT:    s_or_b32 s12, s12, s15
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s12
+; SI-NEXT:    s_and_b32 s12, s13, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s92, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 56, v0
+; SI-NEXT:    s_or_b32 s12, s12, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s12
+; SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; SI-NEXT:    s_lshl_b32 s12, s44, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 60, v0
+; SI-NEXT:    s_or_b32 s10, s10, s12
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s10, s11, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s91, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 64, v0
+; SI-NEXT:    s_or_b32 s10, s10, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; SI-NEXT:    s_lshl_b32 s10, s42, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x44, v0
+; SI-NEXT:    s_or_b32 s8, s8, s10
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s8, s9, 0xffff
+; SI-NEXT:    s_lshl_b32 s9, s90, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x48, v0
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_lshl_b32 s8, s40, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x4c, v0
+; SI-NEXT:    s_or_b32 s6, s6, s8
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s6, s7, 0xffff
+; SI-NEXT:    s_lshl_b32 s7, s89, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x50, v0
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; SI-NEXT:    s_lshl_b32 s6, s14, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x54, v0
+; SI-NEXT:    s_or_b32 s4, s4, s6
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s4
+; SI-NEXT:    s_and_b32 s4, s5, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s88, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x58, v0
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x5c, v0
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT:    v_readlane_b32 s35, v12, 3
+; SI-NEXT:    v_readlane_b32 s34, v12, 2
+; SI-NEXT:    v_readlane_b32 s31, v12, 1
+; SI-NEXT:    v_readlane_b32 s30, v12, 0
+; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB13_4:
-; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; implicit-def: $sgpr59
-; SI-NEXT:    ; implicit-def: $vgpr11
+; SI-NEXT:    ; implicit-def: $sgpr76
+; SI-NEXT:    ; implicit-def: $sgpr35
+; SI-NEXT:    ; implicit-def: $sgpr74
+; SI-NEXT:    ; implicit-def: $sgpr34
+; SI-NEXT:    ; implicit-def: $sgpr72
+; SI-NEXT:    ; implicit-def: $sgpr31
+; SI-NEXT:    ; implicit-def: $sgpr62
+; SI-NEXT:    ; implicit-def: $sgpr30
+; SI-NEXT:    ; implicit-def: $sgpr60
+; SI-NEXT:    ; implicit-def: $sgpr95
 ; SI-NEXT:    ; implicit-def: $sgpr58
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $sgpr57
-; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $sgpr94
 ; SI-NEXT:    ; implicit-def: $sgpr56
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $sgpr47
-; SI-NEXT:    ; implicit-def: $vgpr7
+; SI-NEXT:    ; implicit-def: $sgpr93
 ; SI-NEXT:    ; implicit-def: $sgpr46
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $sgpr45
-; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $sgpr92
 ; SI-NEXT:    ; implicit-def: $sgpr44
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $sgpr43
-; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $sgpr91
 ; SI-NEXT:    ; implicit-def: $sgpr42
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $sgpr41
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $sgpr90
 ; SI-NEXT:    ; implicit-def: $sgpr40
+; SI-NEXT:    ; implicit-def: $sgpr89
+; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr88
 ; SI-NEXT:    s_branch .LBB13_2
 ;
 ; VI-LABEL: bitcast_v24i32_to_v48i16_scalar:
@@ -12561,180 +12573,186 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a,
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
-; SI-NEXT:    v_mov_b32_e32 v26, s16
+; SI-NEXT:    v_mov_b32_e32 v23, s16
 ; SI-NEXT:    v_mov_b32_e32 v24, s17
-; SI-NEXT:    v_mov_b32_e32 v23, s18
+; SI-NEXT:    v_mov_b32_e32 v21, s18
 ; SI-NEXT:    v_mov_b32_e32 v22, s19
-; SI-NEXT:    v_mov_b32_e32 v20, s20
-; SI-NEXT:    v_mov_b32_e32 v19, s21
-; SI-NEXT:    v_mov_b32_e32 v18, s22
-; SI-NEXT:    v_mov_b32_e32 v15, s23
-; SI-NEXT:    v_mov_b32_e32 v17, s24
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    v_mov_b32_e32 v19, s20
+; SI-NEXT:    v_mov_b32_e32 v20, s21
+; SI-NEXT:    v_mov_b32_e32 v17, s22
+; SI-NEXT:    v_mov_b32_e32 v18, s23
+; SI-NEXT:    v_mov_b32_e32 v15, s24
 ; SI-NEXT:    v_mov_b32_e32 v16, s25
-; SI-NEXT:    v_mov_b32_e32 v14, s26
-; SI-NEXT:    v_mov_b32_e32 v13, s27
-; SI-NEXT:    v_mov_b32_e32 v12, s28
-; SI-NEXT:    v_mov_b32_e32 v11, s29
-; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v13, s26
+; SI-NEXT:    v_mov_b32_e32 v14, s27
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    v_mov_b32_e32 v11, s28
+; SI-NEXT:    v_mov_b32_e32 v12, s29
+; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB29_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_alignbit_b32 v21, v10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v25, v8, v7, 16
-; SI-NEXT:    v_alignbit_b32 v27, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v28, v4, v3, 16
-; SI-NEXT:    v_alignbit_b32 v29, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v31, v11, v12, 16
-; SI-NEXT:    v_alignbit_b32 v34, v13, v14, 16
-; SI-NEXT:    v_alignbit_b32 v36, v16, v17, 16
-; SI-NEXT:    v_alignbit_b32 v38, v15, v18, 16
-; SI-NEXT:    v_alignbit_b32 v48, v19, v20, 16
-; SI-NEXT:    v_alignbit_b32 v51, v22, v23, 16
-; SI-NEXT:    v_alignbit_b32 v53, v24, v26, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v11
-; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v13
-; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v15
-; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v19
-; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v22
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v24
+; SI-NEXT:    v_lshr_b64 v[25:26], v[9:10], 16
+; SI-NEXT:    v_lshr_b64 v[32:33], v[11:12], 16
+; SI-NEXT:    v_lshr_b64 v[26:27], v[7:8], 16
+; SI-NEXT:    v_lshr_b64 v[33:34], v[15:16], 16
+; SI-NEXT:    v_lshr_b64 v[27:28], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[34:35], v[17:18], 16
+; SI-NEXT:    v_lshr_b64 v[28:29], v[3:4], 16
+; SI-NEXT:    v_lshr_b64 v[35:36], v[19:20], 16
+; SI-NEXT:    v_lshr_b64 v[29:30], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[36:37], v[21:22], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v18
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v20
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v22
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v24
+; SI-NEXT:    v_lshr_b64 v[30:31], v[13:14], 16
+; SI-NEXT:    v_lshr_b64 v[37:38], v[23:24], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB29_3
 ; SI-NEXT:  .LBB29_2: ; %cmp.true
-; SI-NEXT:    v_add_f32_e32 v24, 1.0, v24
-; SI-NEXT:    v_add_f32_e32 v26, 1.0, v26
-; SI-NEXT:    v_add_f32_e32 v22, 1.0, v22
-; SI-NEXT:    v_add_f32_e32 v23, 1.0, v23
-; SI-NEXT:    v_add_f32_e32 v19, 1.0, v19
-; SI-NEXT:    v_add_f32_e32 v20, 1.0, v20
+; SI-NEXT:    v_add_f32_e32 v12, 1.0, v12
+; SI-NEXT:    v_add_f32_e32 v11, 1.0, v11
+; SI-NEXT:    v_add_f32_e32 v10, 1.0, v10
+; SI-NEXT:    v_add_f32_e32 v9, 1.0, v9
+; SI-NEXT:    v_add_f32_e32 v16, 1.0, v16
 ; SI-NEXT:    v_add_f32_e32 v15, 1.0, v15
+; SI-NEXT:    v_add_f32_e32 v8, 1.0, v8
+; SI-NEXT:    v_add_f32_e32 v7, 1.0, v7
+; SI-NEXT:    v_lshr_b64 v[25:26], v[9:10], 16
+; SI-NEXT:    v_lshr_b64 v[32:33], v[11:12], 16
 ; SI-NEXT:    v_add_f32_e32 v18, 1.0, v18
-; SI-NEXT:    v_add_f32_e32 v16, 1.0, v16
 ; SI-NEXT:    v_add_f32_e32 v17, 1.0, v17
-; SI-NEXT:    v_add_f32_e32 v13, 1.0, v13
-; SI-NEXT:    v_add_f32_e32 v14, 1.0, v14
-; SI-NEXT:    v_add_f32_e32 v11, 1.0, v11
-; SI-NEXT:    v_add_f32_e32 v12, 1.0, v12
-; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_add_f32_e32 v1, 1.0, v1
-; SI-NEXT:    v_add_f32_e32 v4, 1.0, v4
-; SI-NEXT:    v_add_f32_e32 v3, 1.0, v3
 ; SI-NEXT:    v_add_f32_e32 v6, 1.0, v6
 ; SI-NEXT:    v_add_f32_e32 v5, 1.0, v5
-; SI-NEXT:    v_add_f32_e32 v8, 1.0, v8
-; SI-NEXT:    v_add_f32_e32 v7, 1.0, v7
-; SI-NEXT:    v_add_f32_e32 v10, 1.0, v10
-; SI-NEXT:    v_add_f32_e32 v9, 1.0, v9
-; SI-NEXT:    v_alignbit_b32 v21, v10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v25, v8, v7, 16
-; SI-NEXT:    v_alignbit_b32 v27, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v28, v4, v3, 16
-; SI-NEXT:    v_alignbit_b32 v29, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v31, v11, v12, 16
-; SI-NEXT:    v_alignbit_b32 v34, v13, v14, 16
-; SI-NEXT:    v_alignbit_b32 v36, v16, v17, 16
-; SI-NEXT:    v_alignbit_b32 v38, v15, v18, 16
-; SI-NEXT:    v_alignbit_b32 v48, v19, v20, 16
-; SI-NEXT:    v_alignbit_b32 v51, v22, v23, 16
-; SI-NEXT:    v_alignbit_b32 v53, v24, v26, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v11
-; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v13
-; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v15
-; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v19
-; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v22
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v24
+; SI-NEXT:    v_lshr_b64 v[26:27], v[7:8], 16
+; SI-NEXT:    v_lshr_b64 v[33:34], v[15:16], 16
+; SI-NEXT:    v_add_f32_e32 v20, 1.0, v20
+; SI-NEXT:    v_add_f32_e32 v19, 1.0, v19
+; SI-NEXT:    v_add_f32_e32 v4, 1.0, v4
+; SI-NEXT:    v_add_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_lshr_b64 v[27:28], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[34:35], v[17:18], 16
+; SI-NEXT:    v_add_f32_e32 v22, 1.0, v22
+; SI-NEXT:    v_add_f32_e32 v21, 1.0, v21
+; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_lshr_b64 v[28:29], v[3:4], 16
+; SI-NEXT:    v_lshr_b64 v[35:36], v[19:20], 16
+; SI-NEXT:    v_add_f32_e32 v24, 1.0, v24
+; SI-NEXT:    v_add_f32_e32 v23, 1.0, v23
+; SI-NEXT:    v_add_f32_e32 v14, 1.0, v14
+; SI-NEXT:    v_add_f32_e32 v13, 1.0, v13
+; SI-NEXT:    v_lshr_b64 v[29:30], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[36:37], v[21:22], 16
+; SI-NEXT:    v_lshr_b64 v[30:31], v[13:14], 16
+; SI-NEXT:    v_lshr_b64 v[37:38], v[23:24], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v18
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v20
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v22
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v24
 ; SI-NEXT:  .LBB29_3: ; %end
-; SI-NEXT:    v_and_b32_e32 v26, 0xffff, v26
-; SI-NEXT:    v_lshlrev_b32_e32 v53, 16, v53
-; SI-NEXT:    v_or_b32_e32 v26, v26, v53
-; SI-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v24, 0xffff, v24
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v40
-; SI-NEXT:    v_or_b32_e32 v24, v24, v26
-; SI-NEXT:    v_add_i32_e32 v26, vcc, 4, v0
-; SI-NEXT:    buffer_store_dword v24, v26, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v31, 16, v37
 ; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v23
+; SI-NEXT:    v_or_b32_e32 v23, v23, v31
+; SI-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v51
+; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v24
+; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v42
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v24
-; SI-NEXT:    v_add_i32_e32 v24, vcc, 8, v0
+; SI-NEXT:    v_add_i32_e32 v24, vcc, 4, v0
 ; SI-NEXT:    buffer_store_dword v23, v24, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v22, 0xffff, v22
+; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v21
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v55
-; SI-NEXT:    v_or_b32_e32 v22, v22, v23
-; SI-NEXT:    v_add_i32_e32 v23, vcc, 12, v0
-; SI-NEXT:    buffer_store_dword v22, v23, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v20
+; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v36
+; SI-NEXT:    v_or_b32_e32 v21, v21, v23
+; SI-NEXT:    v_add_i32_e32 v23, vcc, 8, v0
+; SI-NEXT:    buffer_store_dword v21, v23, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v48
-; SI-NEXT:    v_or_b32_e32 v20, v20, v22
-; SI-NEXT:    v_add_i32_e32 v22, vcc, 16, v0
-; SI-NEXT:    buffer_store_dword v20, v22, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v22
+; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v41
+; SI-NEXT:    v_or_b32_e32 v21, v21, v22
+; SI-NEXT:    v_add_i32_e32 v22, vcc, 12, v0
+; SI-NEXT:    buffer_store_dword v21, v22, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v54
+; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v35
+; SI-NEXT:    v_or_b32_e32 v19, v19, v21
+; SI-NEXT:    v_add_i32_e32 v21, vcc, 16, v0
+; SI-NEXT:    buffer_store_dword v19, v21, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v20
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v40
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v20
 ; SI-NEXT:    v_add_i32_e32 v20, vcc, 20, v0
 ; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v18
+; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v38
-; SI-NEXT:    v_or_b32_e32 v18, v18, v19
+; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v34
+; SI-NEXT:    v_or_b32_e32 v17, v17, v19
 ; SI-NEXT:    v_add_i32_e32 v19, vcc, 24, v0
-; SI-NEXT:    buffer_store_dword v18, v19, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; SI-NEXT:    buffer_store_dword v17, v19, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v52
-; SI-NEXT:    v_or_b32_e32 v15, v15, v18
+; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v18
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v55
+; SI-NEXT:    v_or_b32_e32 v17, v17, v18
 ; SI-NEXT:    v_add_i32_e32 v18, vcc, 28, v0
-; SI-NEXT:    buffer_store_dword v15, v18, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v17
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v36
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v33
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v17
 ; SI-NEXT:    v_add_i32_e32 v17, vcc, 32, v0
 ; SI-NEXT:    buffer_store_dword v15, v17, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v16
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v50
+; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v54
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v16
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 36, v0
 ; SI-NEXT:    buffer_store_dword v15, v16, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v13
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v34
-; SI-NEXT:    v_or_b32_e32 v14, v14, v15
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v30
+; SI-NEXT:    v_or_b32_e32 v13, v13, v15
 ; SI-NEXT:    v_add_i32_e32 v15, vcc, 40, v0
-; SI-NEXT:    buffer_store_dword v14, v15, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; SI-NEXT:    buffer_store_dword v13, v15, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v49
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v53
 ; SI-NEXT:    v_or_b32_e32 v13, v13, v14
 ; SI-NEXT:    v_add_i32_e32 v14, vcc, 44, v0
 ; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v12, 0xffff, v12
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v31
-; SI-NEXT:    v_or_b32_e32 v12, v12, v13
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v32
+; SI-NEXT:    v_or_b32_e32 v11, v11, v13
 ; SI-NEXT:    v_add_i32_e32 v13, vcc, 48, v0
-; SI-NEXT:    buffer_store_dword v12, v13, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
+; SI-NEXT:    buffer_store_dword v11, v13, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v39
+; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v12
+; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v52
 ; SI-NEXT:    v_or_b32_e32 v11, v11, v12
 ; SI-NEXT:    v_add_i32_e32 v12, vcc, 52, v0
 ; SI-NEXT:    buffer_store_dword v11, v12, s[0:3], 0 offen
@@ -12746,7 +12764,7 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v11, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v37
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v51
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 60, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -12758,7 +12776,7 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v35
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v50
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x44, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -12770,62 +12788,64 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v33
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v49
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x4c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v25
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v26
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x50, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v32
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v48
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x54, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v21
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v25
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x58, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v10
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v30
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v39
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x5c, v0
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB29_4:
-; SI-NEXT:    ; implicit-def: $vgpr53
+; SI-NEXT:    ; implicit-def: $vgpr37
+; SI-NEXT:    ; implicit-def: $vgpr42
+; SI-NEXT:    ; implicit-def: $vgpr36
+; SI-NEXT:    ; implicit-def: $vgpr41
+; SI-NEXT:    ; implicit-def: $vgpr35
 ; SI-NEXT:    ; implicit-def: $vgpr40
-; SI-NEXT:    ; implicit-def: $vgpr51
+; SI-NEXT:    ; implicit-def: $vgpr34
 ; SI-NEXT:    ; implicit-def: $vgpr55
-; SI-NEXT:    ; implicit-def: $vgpr48
+; SI-NEXT:    ; implicit-def: $vgpr33
 ; SI-NEXT:    ; implicit-def: $vgpr54
-; SI-NEXT:    ; implicit-def: $vgpr38
+; SI-NEXT:    ; implicit-def: $vgpr30
+; SI-NEXT:    ; implicit-def: $vgpr53
 ; SI-NEXT:    ; implicit-def: $vgpr52
-; SI-NEXT:    ; implicit-def: $vgpr36
+; SI-NEXT:    ; implicit-def: $vgpr51
 ; SI-NEXT:    ; implicit-def: $vgpr50
-; SI-NEXT:    ; implicit-def: $vgpr34
 ; SI-NEXT:    ; implicit-def: $vgpr49
-; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr48
 ; SI-NEXT:    ; implicit-def: $vgpr39
+; SI-NEXT:    ; implicit-def: $vgpr32
 ; SI-NEXT:    ; implicit-def: $vgpr29
-; SI-NEXT:    ; implicit-def: $vgpr37
 ; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr35
 ; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $vgpr33
+; SI-NEXT:    ; implicit-def: $vgpr26
 ; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr30
 ; SI-NEXT:    s_branch .LBB29_2
 ;
 ; VI-LABEL: bitcast_v24f32_to_v48i16_scalar:
@@ -21128,289 +21148,301 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3
 ; SI-LABEL: bitcast_v12i64_to_v48i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_store_dword v12, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_writelane_b32 v12, s30, 0
+; SI-NEXT:    v_writelane_b32 v12, s31, 1
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
-; SI-NEXT:    v_readfirstlane_b32 s15, v1
-; SI-NEXT:    v_readfirstlane_b32 s14, v2
-; SI-NEXT:    v_readfirstlane_b32 s13, v3
-; SI-NEXT:    v_readfirstlane_b32 s12, v4
-; SI-NEXT:    v_readfirstlane_b32 s11, v5
-; SI-NEXT:    v_readfirstlane_b32 s10, v6
-; SI-NEXT:    v_readfirstlane_b32 s9, v7
-; SI-NEXT:    v_readfirstlane_b32 s8, v8
-; SI-NEXT:    v_readfirstlane_b32 s7, v9
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_readfirstlane_b32 s6, v10
+; SI-NEXT:    v_writelane_b32 v12, s34, 2
+; SI-NEXT:    v_readfirstlane_b32 s12, v1
+; SI-NEXT:    v_readfirstlane_b32 s13, v2
+; SI-NEXT:    v_readfirstlane_b32 s10, v3
+; SI-NEXT:    v_readfirstlane_b32 s11, v4
+; SI-NEXT:    v_readfirstlane_b32 s8, v5
+; SI-NEXT:    v_readfirstlane_b32 s9, v6
+; SI-NEXT:    v_readfirstlane_b32 s6, v7
+; SI-NEXT:    v_readfirstlane_b32 s7, v8
+; SI-NEXT:    v_readfirstlane_b32 s4, v9
+; SI-NEXT:    s_and_b64 s[14:15], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s5, v10
+; SI-NEXT:    v_writelane_b32 v12, s35, 3
 ; SI-NEXT:    s_cbranch_scc0 .LBB41_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s9
-; SI-NEXT:    v_mov_b32_e32 v3, s11
-; SI-NEXT:    v_mov_b32_e32 v4, s13
-; SI-NEXT:    v_mov_b32_e32 v5, s15
-; SI-NEXT:    v_mov_b32_e32 v6, s28
-; SI-NEXT:    v_mov_b32_e32 v7, s26
-; SI-NEXT:    v_mov_b32_e32 v8, s24
-; SI-NEXT:    v_mov_b32_e32 v9, s22
-; SI-NEXT:    v_mov_b32_e32 v10, s20
-; SI-NEXT:    v_mov_b32_e32 v11, s18
-; SI-NEXT:    v_mov_b32_e32 v12, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v2, s8, v2, 16
-; SI-NEXT:    v_alignbit_b32 v3, s10, v3, 16
-; SI-NEXT:    v_alignbit_b32 v4, s12, v4, 16
-; SI-NEXT:    v_alignbit_b32 v5, s14, v5, 16
-; SI-NEXT:    v_alignbit_b32 v6, s29, v6, 16
-; SI-NEXT:    v_alignbit_b32 v7, s27, v7, 16
-; SI-NEXT:    v_alignbit_b32 v8, s25, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, s23, v9, 16
-; SI-NEXT:    v_alignbit_b32 v10, s21, v10, 16
-; SI-NEXT:    v_alignbit_b32 v11, s19, v11, 16
-; SI-NEXT:    v_alignbit_b32 v12, s17, v12, 16
-; SI-NEXT:    s_lshr_b32 s40, s6, 16
-; SI-NEXT:    s_lshr_b32 s41, s8, 16
-; SI-NEXT:    s_lshr_b32 s42, s10, 16
-; SI-NEXT:    s_lshr_b32 s43, s12, 16
-; SI-NEXT:    s_lshr_b32 s44, s14, 16
-; SI-NEXT:    s_lshr_b32 s45, s29, 16
-; SI-NEXT:    s_lshr_b32 s46, s27, 16
-; SI-NEXT:    s_lshr_b32 s47, s25, 16
-; SI-NEXT:    s_lshr_b32 s56, s23, 16
-; SI-NEXT:    s_lshr_b32 s57, s21, 16
-; SI-NEXT:    s_lshr_b32 s58, s19, 16
-; SI-NEXT:    s_lshr_b32 s59, s17, 16
+; SI-NEXT:    s_lshr_b32 s88, s5, 16
+; SI-NEXT:    s_lshr_b32 s89, s7, 16
+; SI-NEXT:    s_lshr_b32 s90, s9, 16
+; SI-NEXT:    s_lshr_b32 s91, s11, 16
+; SI-NEXT:    s_lshr_b32 s92, s13, 16
+; SI-NEXT:    s_lshr_b32 s93, s29, 16
+; SI-NEXT:    s_lshr_b32 s94, s27, 16
+; SI-NEXT:    s_lshr_b32 s95, s25, 16
+; SI-NEXT:    s_lshr_b32 s30, s23, 16
+; SI-NEXT:    s_lshr_b32 s31, s21, 16
+; SI-NEXT:    s_lshr_b32 s34, s19, 16
+; SI-NEXT:    s_lshr_b32 s35, s17, 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[42:43], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[10:11], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[12:13], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[58:59], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[62:63], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[72:73], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[74:75], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[76:77], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB41_3
 ; SI-NEXT:  .LBB41_2: ; %cmp.true
-; SI-NEXT:    s_add_u32 s16, s16, 3
-; SI-NEXT:    s_addc_u32 s17, s17, 0
-; SI-NEXT:    s_add_u32 s18, s18, 3
-; SI-NEXT:    s_addc_u32 s19, s19, 0
-; SI-NEXT:    s_add_u32 s20, s20, 3
-; SI-NEXT:    s_addc_u32 s21, s21, 0
-; SI-NEXT:    s_add_u32 s22, s22, 3
-; SI-NEXT:    s_addc_u32 s23, s23, 0
-; SI-NEXT:    s_add_u32 s24, s24, 3
-; SI-NEXT:    s_addc_u32 s25, s25, 0
-; SI-NEXT:    s_add_u32 s26, s26, 3
-; SI-NEXT:    s_addc_u32 s27, s27, 0
+; SI-NEXT:    s_add_u32 s4, s4, 3
+; SI-NEXT:    s_addc_u32 s5, s5, 0
+; SI-NEXT:    s_add_u32 s6, s6, 3
+; SI-NEXT:    s_addc_u32 s7, s7, 0
+; SI-NEXT:    s_add_u32 s8, s8, 3
+; SI-NEXT:    s_addc_u32 s9, s9, 0
+; SI-NEXT:    s_add_u32 s10, s10, 3
+; SI-NEXT:    s_addc_u32 s11, s11, 0
+; SI-NEXT:    s_add_u32 s12, s12, 3
+; SI-NEXT:    s_addc_u32 s13, s13, 0
 ; SI-NEXT:    s_add_u32 s28, s28, 3
 ; SI-NEXT:    s_addc_u32 s29, s29, 0
-; SI-NEXT:    s_add_u32 s15, s15, 3
-; SI-NEXT:    s_addc_u32 s14, s14, 0
-; SI-NEXT:    s_add_u32 s13, s13, 3
-; SI-NEXT:    s_addc_u32 s12, s12, 0
-; SI-NEXT:    s_add_u32 s11, s11, 3
-; SI-NEXT:    s_addc_u32 s10, s10, 0
-; SI-NEXT:    s_add_u32 s9, s9, 3
-; SI-NEXT:    s_addc_u32 s8, s8, 0
-; SI-NEXT:    s_add_u32 s7, s7, 3
-; SI-NEXT:    s_addc_u32 s6, s6, 0
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s9
-; SI-NEXT:    v_mov_b32_e32 v3, s11
-; SI-NEXT:    v_mov_b32_e32 v4, s13
-; SI-NEXT:    v_mov_b32_e32 v5, s15
-; SI-NEXT:    v_mov_b32_e32 v6, s28
-; SI-NEXT:    v_mov_b32_e32 v7, s26
-; SI-NEXT:    v_mov_b32_e32 v8, s24
-; SI-NEXT:    v_mov_b32_e32 v9, s22
-; SI-NEXT:    v_mov_b32_e32 v10, s20
-; SI-NEXT:    v_mov_b32_e32 v11, s18
-; SI-NEXT:    v_mov_b32_e32 v12, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v2, s8, v2, 16
-; SI-NEXT:    v_alignbit_b32 v3, s10, v3, 16
-; SI-NEXT:    v_alignbit_b32 v4, s12, v4, 16
-; SI-NEXT:    v_alignbit_b32 v5, s14, v5, 16
-; SI-NEXT:    v_alignbit_b32 v6, s29, v6, 16
-; SI-NEXT:    v_alignbit_b32 v7, s27, v7, 16
-; SI-NEXT:    v_alignbit_b32 v8, s25, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, s23, v9, 16
-; SI-NEXT:    v_alignbit_b32 v10, s21, v10, 16
-; SI-NEXT:    v_alignbit_b32 v11, s19, v11, 16
-; SI-NEXT:    v_alignbit_b32 v12, s17, v12, 16
-; SI-NEXT:    s_lshr_b32 s40, s6, 16
-; SI-NEXT:    s_lshr_b32 s41, s8, 16
-; SI-NEXT:    s_lshr_b32 s42, s10, 16
-; SI-NEXT:    s_lshr_b32 s43, s12, 16
-; SI-NEXT:    s_lshr_b32 s44, s14, 16
-; SI-NEXT:    s_lshr_b32 s45, s29, 16
-; SI-NEXT:    s_lshr_b32 s46, s27, 16
-; SI-NEXT:    s_lshr_b32 s47, s25, 16
-; SI-NEXT:    s_lshr_b32 s56, s23, 16
-; SI-NEXT:    s_lshr_b32 s57, s21, 16
-; SI-NEXT:    s_lshr_b32 s58, s19, 16
-; SI-NEXT:    s_lshr_b32 s59, s17, 16
+; SI-NEXT:    s_add_u32 s26, s26, 3
+; SI-NEXT:    s_addc_u32 s27, s27, 0
+; SI-NEXT:    s_add_u32 s24, s24, 3
+; SI-NEXT:    s_addc_u32 s25, s25, 0
+; SI-NEXT:    s_add_u32 s22, s22, 3
+; SI-NEXT:    s_addc_u32 s23, s23, 0
+; SI-NEXT:    s_add_u32 s20, s20, 3
+; SI-NEXT:    s_addc_u32 s21, s21, 0
+; SI-NEXT:    s_add_u32 s18, s18, 3
+; SI-NEXT:    s_addc_u32 s19, s19, 0
+; SI-NEXT:    s_add_u32 s16, s16, 3
+; SI-NEXT:    s_addc_u32 s17, s17, 0
+; SI-NEXT:    s_lshr_b32 s88, s5, 16
+; SI-NEXT:    s_lshr_b32 s89, s7, 16
+; SI-NEXT:    s_lshr_b32 s90, s9, 16
+; SI-NEXT:    s_lshr_b32 s91, s11, 16
+; SI-NEXT:    s_lshr_b32 s92, s13, 16
+; SI-NEXT:    s_lshr_b32 s93, s29, 16
+; SI-NEXT:    s_lshr_b32 s94, s27, 16
+; SI-NEXT:    s_lshr_b32 s95, s25, 16
+; SI-NEXT:    s_lshr_b32 s30, s23, 16
+; SI-NEXT:    s_lshr_b32 s31, s21, 16
+; SI-NEXT:    s_lshr_b32 s34, s19, 16
+; SI-NEXT:    s_lshr_b32 s35, s17, 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[42:43], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[10:11], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[12:13], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[58:59], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[62:63], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[72:73], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[74:75], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[76:77], s[16:17], 16
 ; SI-NEXT:  .LBB41_3: ; %end
-; SI-NEXT:    s_and_b32 s4, s16, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; SI-NEXT:    v_or_b32_e32 v12, s4, v12
-; SI-NEXT:    s_and_b32 s4, s17, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s59, 16
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_mov_b32_e32 v13, s4
-; SI-NEXT:    s_and_b32 s4, s18, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
-; SI-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v12, vcc, 4, v0
-; SI-NEXT:    v_or_b32_e32 v11, s4, v11
-; SI-NEXT:    s_and_b32 s4, s19, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s58, 16
-; SI-NEXT:    buffer_store_dword v13, v12, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v12, vcc, 8, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v11, v12, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v12, s4
-; SI-NEXT:    s_and_b32 s4, s20, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    s_lshl_b32 s15, s76, 16
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s15, s16, s15
+; SI-NEXT:    v_mov_b32_e32 v1, s15
+; SI-NEXT:    s_and_b32 s15, s17, 0xffff
+; SI-NEXT:    s_lshl_b32 s16, s35, 16
+; SI-NEXT:    s_or_b32 s15, s15, s16
+; SI-NEXT:    v_mov_b32_e32 v2, s15
+; SI-NEXT:    s_lshl_b32 s15, s74, 16
+; SI-NEXT:    s_and_b32 s16, s18, 0xffff
+; SI-NEXT:    s_or_b32 s15, s16, s15
+; SI-NEXT:    v_mov_b32_e32 v3, s15
+; SI-NEXT:    s_and_b32 s15, s19, 0xffff
+; SI-NEXT:    s_lshl_b32 s16, s34, 16
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 12, v0
-; SI-NEXT:    v_or_b32_e32 v10, s4, v10
-; SI-NEXT:    s_and_b32 s4, s21, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s57, 16
-; SI-NEXT:    buffer_store_dword v12, v11, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 16, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v10, v11, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v11, s4
-; SI-NEXT:    s_and_b32 s4, s22, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
+; SI-NEXT:    s_or_b32 s15, s15, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s15
+; SI-NEXT:    s_and_b32 s15, s20, 0xffff
+; SI-NEXT:    s_lshl_b32 s16, s72, 16
+; SI-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 12, v0
+; SI-NEXT:    s_or_b32 s15, s15, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 20, v0
-; SI-NEXT:    v_or_b32_e32 v9, s4, v9
-; SI-NEXT:    s_and_b32 s4, s23, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s56, 16
-; SI-NEXT:    buffer_store_dword v11, v10, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 24, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v9, v10, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v10, s4
-; SI-NEXT:    s_and_b32 s4, s24, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; SI-NEXT:    v_mov_b32_e32 v2, s15
+; SI-NEXT:    s_and_b32 s15, s21, 0xffff
+; SI-NEXT:    s_lshl_b32 s16, s31, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 16, v0
+; SI-NEXT:    s_or_b32 s15, s15, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 28, v0
-; SI-NEXT:    v_or_b32_e32 v8, s4, v8
-; SI-NEXT:    s_and_b32 s4, s25, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s47, 16
-; SI-NEXT:    buffer_store_dword v10, v9, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 32, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v8, v9, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v9, s4
-; SI-NEXT:    s_and_b32 s4, s26, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT:    v_mov_b32_e32 v2, s15
+; SI-NEXT:    s_and_b32 s15, s22, 0xffff
+; SI-NEXT:    s_lshl_b32 s16, s62, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 20, v0
+; SI-NEXT:    s_or_b32 s15, s15, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 36, v0
-; SI-NEXT:    v_or_b32_e32 v7, s4, v7
-; SI-NEXT:    s_and_b32 s4, s27, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s46, 16
-; SI-NEXT:    buffer_store_dword v9, v8, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 40, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v8, s4
-; SI-NEXT:    s_and_b32 s4, s28, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT:    v_mov_b32_e32 v2, s15
+; SI-NEXT:    s_and_b32 s15, s23, 0xffff
+; SI-NEXT:    s_lshl_b32 s16, s30, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 24, v0
+; SI-NEXT:    s_or_b32 s15, s15, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 44, v0
-; SI-NEXT:    v_or_b32_e32 v6, s4, v6
-; SI-NEXT:    s_and_b32 s4, s29, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s45, 16
-; SI-NEXT:    buffer_store_dword v8, v7, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v7, s4
-; SI-NEXT:    s_and_b32 s4, s15, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_mov_b32_e32 v2, s15
+; SI-NEXT:    s_and_b32 s15, s24, 0xffff
+; SI-NEXT:    s_lshl_b32 s16, s60, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 28, v0
+; SI-NEXT:    s_or_b32 s15, s15, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 52, v0
-; SI-NEXT:    v_or_b32_e32 v5, s4, v5
-; SI-NEXT:    s_and_b32 s4, s14, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s44, 16
-; SI-NEXT:    buffer_store_dword v7, v6, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 56, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v6, s4
-; SI-NEXT:    s_and_b32 s4, s13, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_mov_b32_e32 v2, s15
+; SI-NEXT:    s_and_b32 s15, s25, 0xffff
+; SI-NEXT:    s_lshl_b32 s16, s95, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 32, v0
+; SI-NEXT:    s_or_b32 s15, s15, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 60, v0
-; SI-NEXT:    v_or_b32_e32 v4, s4, v4
-; SI-NEXT:    s_and_b32 s4, s12, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s43, 16
-; SI-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 64, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v5, s4
-; SI-NEXT:    s_and_b32 s4, s11, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_mov_b32_e32 v2, s15
+; SI-NEXT:    s_and_b32 s15, s26, 0xffff
+; SI-NEXT:    s_lshl_b32 s16, s58, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 36, v0
+; SI-NEXT:    s_or_b32 s15, s15, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x44, v0
-; SI-NEXT:    v_or_b32_e32 v3, s4, v3
-; SI-NEXT:    s_and_b32 s4, s10, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s42, 16
-; SI-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v4, s4
-; SI-NEXT:    s_and_b32 s4, s9, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_mov_b32_e32 v2, s15
+; SI-NEXT:    s_and_b32 s15, s27, 0xffff
+; SI-NEXT:    s_lshl_b32 s16, s94, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 40, v0
+; SI-NEXT:    s_or_b32 s15, s15, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x4c, v0
-; SI-NEXT:    v_or_b32_e32 v2, s4, v2
-; SI-NEXT:    s_and_b32 s4, s8, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s41, 16
-; SI-NEXT:    buffer_store_dword v4, v3, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x50, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v3, s4
-; SI-NEXT:    s_and_b32 s4, s7, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_mov_b32_e32 v2, s15
+; SI-NEXT:    s_and_b32 s15, s28, 0xffff
+; SI-NEXT:    s_lshl_b32 s16, s56, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 44, v0
+; SI-NEXT:    s_or_b32 s15, s15, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x54, v0
-; SI-NEXT:    v_or_b32_e32 v1, s4, v1
-; SI-NEXT:    s_and_b32 s4, s6, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s40, 16
-; SI-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x58, v0
+; SI-NEXT:    v_mov_b32_e32 v2, s15
+; SI-NEXT:    s_and_b32 s15, s29, 0xffff
+; SI-NEXT:    s_lshl_b32 s16, s93, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 48, v0
+; SI-NEXT:    s_or_b32 s15, s15, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s15
+; SI-NEXT:    s_and_b32 s12, s12, 0xffff
+; SI-NEXT:    s_lshl_b32 s15, s46, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 52, v0
+; SI-NEXT:    s_or_b32 s12, s12, s15
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s12
+; SI-NEXT:    s_and_b32 s12, s13, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s92, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 56, v0
+; SI-NEXT:    s_or_b32 s12, s12, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s12
+; SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; SI-NEXT:    s_lshl_b32 s12, s44, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 60, v0
+; SI-NEXT:    s_or_b32 s10, s10, s12
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s10, s11, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s91, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 64, v0
+; SI-NEXT:    s_or_b32 s10, s10, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; SI-NEXT:    s_lshl_b32 s10, s42, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x44, v0
+; SI-NEXT:    s_or_b32 s8, s8, s10
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s8, s9, 0xffff
+; SI-NEXT:    s_lshl_b32 s9, s90, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x48, v0
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_lshl_b32 s8, s40, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x4c, v0
+; SI-NEXT:    s_or_b32 s6, s6, s8
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s6, s7, 0xffff
+; SI-NEXT:    s_lshl_b32 s7, s89, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x50, v0
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; SI-NEXT:    s_lshl_b32 s6, s14, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x54, v0
+; SI-NEXT:    s_or_b32 s4, s4, s6
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s4
+; SI-NEXT:    s_and_b32 s4, s5, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s88, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x58, v0
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x5c, v0
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT:    v_readlane_b32 s35, v12, 3
+; SI-NEXT:    v_readlane_b32 s34, v12, 2
+; SI-NEXT:    v_readlane_b32 s31, v12, 1
+; SI-NEXT:    v_readlane_b32 s30, v12, 0
+; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB41_4:
-; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; implicit-def: $sgpr59
-; SI-NEXT:    ; implicit-def: $vgpr11
+; SI-NEXT:    ; implicit-def: $sgpr76
+; SI-NEXT:    ; implicit-def: $sgpr35
+; SI-NEXT:    ; implicit-def: $sgpr74
+; SI-NEXT:    ; implicit-def: $sgpr34
+; SI-NEXT:    ; implicit-def: $sgpr72
+; SI-NEXT:    ; implicit-def: $sgpr31
+; SI-NEXT:    ; implicit-def: $sgpr62
+; SI-NEXT:    ; implicit-def: $sgpr30
+; SI-NEXT:    ; implicit-def: $sgpr60
+; SI-NEXT:    ; implicit-def: $sgpr95
 ; SI-NEXT:    ; implicit-def: $sgpr58
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $sgpr57
-; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $sgpr94
 ; SI-NEXT:    ; implicit-def: $sgpr56
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $sgpr47
-; SI-NEXT:    ; implicit-def: $vgpr7
+; SI-NEXT:    ; implicit-def: $sgpr93
 ; SI-NEXT:    ; implicit-def: $sgpr46
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $sgpr45
-; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $sgpr92
 ; SI-NEXT:    ; implicit-def: $sgpr44
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $sgpr43
-; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $sgpr91
 ; SI-NEXT:    ; implicit-def: $sgpr42
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $sgpr41
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $sgpr90
 ; SI-NEXT:    ; implicit-def: $sgpr40
+; SI-NEXT:    ; implicit-def: $sgpr89
+; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr88
 ; SI-NEXT:    s_branch .LBB41_2
 ;
 ; VI-LABEL: bitcast_v12i64_to_v48i16_scalar:
@@ -28931,153 +28963,159 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a,
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    v_mov_b32_e32 v11, s28
 ; SI-NEXT:    v_mov_b32_e32 v12, s29
-; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB49_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_alignbit_b32 v25, v10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v26, v8, v7, 16
-; SI-NEXT:    v_alignbit_b32 v27, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v28, v4, v3, 16
-; SI-NEXT:    v_alignbit_b32 v29, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v31, v12, v11, 16
-; SI-NEXT:    v_alignbit_b32 v33, v14, v13, 16
-; SI-NEXT:    v_alignbit_b32 v35, v16, v15, 16
-; SI-NEXT:    v_alignbit_b32 v38, v18, v17, 16
-; SI-NEXT:    v_alignbit_b32 v48, v20, v19, 16
-; SI-NEXT:    v_alignbit_b32 v50, v22, v21, 16
-; SI-NEXT:    v_alignbit_b32 v53, v24, v23, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v20
-; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v22
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v24
+; SI-NEXT:    v_lshr_b64 v[25:26], v[9:10], 16
+; SI-NEXT:    v_lshr_b64 v[32:33], v[11:12], 16
+; SI-NEXT:    v_lshr_b64 v[26:27], v[7:8], 16
+; SI-NEXT:    v_lshr_b64 v[33:34], v[15:16], 16
+; SI-NEXT:    v_lshr_b64 v[27:28], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[34:35], v[17:18], 16
+; SI-NEXT:    v_lshr_b64 v[28:29], v[3:4], 16
+; SI-NEXT:    v_lshr_b64 v[35:36], v[19:20], 16
+; SI-NEXT:    v_lshr_b64 v[29:30], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[36:37], v[21:22], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v18
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v20
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v22
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v24
+; SI-NEXT:    v_lshr_b64 v[30:31], v[13:14], 16
+; SI-NEXT:    v_lshr_b64 v[37:38], v[23:24], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB49_3
 ; SI-NEXT:  .LBB49_2: ; %cmp.true
-; SI-NEXT:    v_add_f64 v[23:24], v[23:24], 1.0
-; SI-NEXT:    v_add_f64 v[21:22], v[21:22], 1.0
-; SI-NEXT:    v_add_f64 v[19:20], v[19:20], 1.0
-; SI-NEXT:    v_add_f64 v[17:18], v[17:18], 1.0
-; SI-NEXT:    v_add_f64 v[15:16], v[15:16], 1.0
-; SI-NEXT:    v_add_f64 v[13:14], v[13:14], 1.0
-; SI-NEXT:    v_add_f64 v[11:12], v[11:12], 1.0
-; SI-NEXT:    v_add_f64 v[1:2], v[1:2], 1.0
-; SI-NEXT:    v_add_f64 v[3:4], v[3:4], 1.0
-; SI-NEXT:    v_add_f64 v[5:6], v[5:6], 1.0
 ; SI-NEXT:    v_add_f64 v[9:10], v[9:10], 1.0
+; SI-NEXT:    v_add_f64 v[11:12], v[11:12], 1.0
 ; SI-NEXT:    v_add_f64 v[7:8], v[7:8], 1.0
-; SI-NEXT:    v_alignbit_b32 v25, v10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v26, v8, v7, 16
-; SI-NEXT:    v_alignbit_b32 v27, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v28, v4, v3, 16
-; SI-NEXT:    v_alignbit_b32 v29, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v31, v12, v11, 16
-; SI-NEXT:    v_alignbit_b32 v33, v14, v13, 16
-; SI-NEXT:    v_alignbit_b32 v35, v16, v15, 16
-; SI-NEXT:    v_alignbit_b32 v38, v18, v17, 16
-; SI-NEXT:    v_alignbit_b32 v48, v20, v19, 16
-; SI-NEXT:    v_alignbit_b32 v50, v22, v21, 16
-; SI-NEXT:    v_alignbit_b32 v53, v24, v23, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v20
-; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v22
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v24
+; SI-NEXT:    v_add_f64 v[15:16], v[15:16], 1.0
+; SI-NEXT:    v_add_f64 v[5:6], v[5:6], 1.0
+; SI-NEXT:    v_add_f64 v[17:18], v[17:18], 1.0
+; SI-NEXT:    v_lshr_b64 v[25:26], v[9:10], 16
+; SI-NEXT:    v_lshr_b64 v[32:33], v[11:12], 16
+; SI-NEXT:    v_add_f64 v[3:4], v[3:4], 1.0
+; SI-NEXT:    v_add_f64 v[19:20], v[19:20], 1.0
+; SI-NEXT:    v_lshr_b64 v[26:27], v[7:8], 16
+; SI-NEXT:    v_lshr_b64 v[33:34], v[15:16], 16
+; SI-NEXT:    v_add_f64 v[1:2], v[1:2], 1.0
+; SI-NEXT:    v_add_f64 v[21:22], v[21:22], 1.0
+; SI-NEXT:    v_lshr_b64 v[27:28], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[34:35], v[17:18], 16
+; SI-NEXT:    v_add_f64 v[13:14], v[13:14], 1.0
+; SI-NEXT:    v_add_f64 v[23:24], v[23:24], 1.0
+; SI-NEXT:    v_lshr_b64 v[28:29], v[3:4], 16
+; SI-NEXT:    v_lshr_b64 v[35:36], v[19:20], 16
+; SI-NEXT:    v_lshr_b64 v[29:30], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[36:37], v[21:22], 16
+; SI-NEXT:    v_lshr_b64 v[30:31], v[13:14], 16
+; SI-NEXT:    v_lshr_b64 v[37:38], v[23:24], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v18
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v20
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v22
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v24
 ; SI-NEXT:  .LBB49_3: ; %end
+; SI-NEXT:    v_lshlrev_b32_e32 v31, 16, v37
 ; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v23
-; SI-NEXT:    v_lshlrev_b32_e32 v53, 16, v53
-; SI-NEXT:    v_or_b32_e32 v23, v23, v53
+; SI-NEXT:    v_or_b32_e32 v23, v23, v31
 ; SI-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v40
+; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v42
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v24
 ; SI-NEXT:    v_add_i32_e32 v24, vcc, 4, v0
 ; SI-NEXT:    buffer_store_dword v23, v24, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v21
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v50
+; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v36
 ; SI-NEXT:    v_or_b32_e32 v21, v21, v23
 ; SI-NEXT:    v_add_i32_e32 v23, vcc, 8, v0
 ; SI-NEXT:    buffer_store_dword v21, v23, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v22
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v55
+; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v41
 ; SI-NEXT:    v_or_b32_e32 v21, v21, v22
 ; SI-NEXT:    v_add_i32_e32 v22, vcc, 12, v0
 ; SI-NEXT:    buffer_store_dword v21, v22, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v48
+; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v35
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v21
 ; SI-NEXT:    v_add_i32_e32 v21, vcc, 16, v0
 ; SI-NEXT:    buffer_store_dword v19, v21, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v54
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v40
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v20
 ; SI-NEXT:    v_add_i32_e32 v20, vcc, 20, v0
 ; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v38
+; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v34
 ; SI-NEXT:    v_or_b32_e32 v17, v17, v19
 ; SI-NEXT:    v_add_i32_e32 v19, vcc, 24, v0
 ; SI-NEXT:    buffer_store_dword v17, v19, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v18
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v52
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v55
 ; SI-NEXT:    v_or_b32_e32 v17, v17, v18
 ; SI-NEXT:    v_add_i32_e32 v18, vcc, 28, v0
 ; SI-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v35
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v33
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v17
 ; SI-NEXT:    v_add_i32_e32 v17, vcc, 32, v0
 ; SI-NEXT:    buffer_store_dword v15, v17, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v16
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v51
+; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v54
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v16
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 36, v0
 ; SI-NEXT:    buffer_store_dword v15, v16, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v13
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v33
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v30
 ; SI-NEXT:    v_or_b32_e32 v13, v13, v15
 ; SI-NEXT:    v_add_i32_e32 v15, vcc, 40, v0
 ; SI-NEXT:    buffer_store_dword v13, v15, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v14
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v49
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v53
 ; SI-NEXT:    v_or_b32_e32 v13, v13, v14
 ; SI-NEXT:    v_add_i32_e32 v14, vcc, 44, v0
 ; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v11
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v31
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v32
 ; SI-NEXT:    v_or_b32_e32 v11, v11, v13
 ; SI-NEXT:    v_add_i32_e32 v13, vcc, 48, v0
 ; SI-NEXT:    buffer_store_dword v11, v13, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v39
+; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v52
 ; SI-NEXT:    v_or_b32_e32 v11, v11, v12
 ; SI-NEXT:    v_add_i32_e32 v12, vcc, 52, v0
 ; SI-NEXT:    buffer_store_dword v11, v12, s[0:3], 0 offen
@@ -29089,7 +29127,7 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v11, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v37
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v51
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 60, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -29101,7 +29139,7 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v36
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v50
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x44, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -29113,7 +29151,7 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v34
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v49
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x4c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -29125,7 +29163,7 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v32
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v48
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x54, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -29137,38 +29175,40 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v10
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v30
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v39
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x5c, v0
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB49_4:
-; SI-NEXT:    ; implicit-def: $vgpr53
+; SI-NEXT:    ; implicit-def: $vgpr37
+; SI-NEXT:    ; implicit-def: $vgpr42
+; SI-NEXT:    ; implicit-def: $vgpr36
+; SI-NEXT:    ; implicit-def: $vgpr41
+; SI-NEXT:    ; implicit-def: $vgpr35
 ; SI-NEXT:    ; implicit-def: $vgpr40
-; SI-NEXT:    ; implicit-def: $vgpr50
+; SI-NEXT:    ; implicit-def: $vgpr34
 ; SI-NEXT:    ; implicit-def: $vgpr55
-; SI-NEXT:    ; implicit-def: $vgpr48
+; SI-NEXT:    ; implicit-def: $vgpr33
 ; SI-NEXT:    ; implicit-def: $vgpr54
-; SI-NEXT:    ; implicit-def: $vgpr38
+; SI-NEXT:    ; implicit-def: $vgpr30
+; SI-NEXT:    ; implicit-def: $vgpr53
 ; SI-NEXT:    ; implicit-def: $vgpr52
-; SI-NEXT:    ; implicit-def: $vgpr35
 ; SI-NEXT:    ; implicit-def: $vgpr51
-; SI-NEXT:    ; implicit-def: $vgpr33
+; SI-NEXT:    ; implicit-def: $vgpr50
 ; SI-NEXT:    ; implicit-def: $vgpr49
-; SI-NEXT:    ; implicit-def: $vgpr31
+; SI-NEXT:    ; implicit-def: $vgpr48
 ; SI-NEXT:    ; implicit-def: $vgpr39
+; SI-NEXT:    ; implicit-def: $vgpr32
 ; SI-NEXT:    ; implicit-def: $vgpr29
-; SI-NEXT:    ; implicit-def: $vgpr37
 ; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr36
 ; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $vgpr34
 ; SI-NEXT:    ; implicit-def: $vgpr26
-; SI-NEXT:    ; implicit-def: $vgpr32
 ; SI-NEXT:    ; implicit-def: $vgpr25
-; SI-NEXT:    ; implicit-def: $vgpr30
 ; SI-NEXT:    s_branch .LBB49_2
 ;
 ; VI-LABEL: bitcast_v12f64_to_v48i16_scalar:
@@ -39277,428 +39317,464 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i
 ; SI-LABEL: bitcast_v48f16_to_v48i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32
-; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:4
+; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:16
+; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32
+; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:4
+; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:8
 ; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:8
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:12
+; SI-NEXT:    v_cvt_f16_f32_e32 v61, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v55, v3
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:16
-; SI-NEXT:    v_cvt_f16_f32_e32 v49, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v36, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v55, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v38, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v37, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v39, v7
-; SI-NEXT:    v_cvt_f16_f32_e32 v54, v8
-; SI-NEXT:    v_cvt_f16_f32_e32 v34, v9
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, v10
-; SI-NEXT:    v_cvt_f16_f32_e32 v32, v11
-; SI-NEXT:    v_cvt_f16_f32_e32 v53, v12
-; SI-NEXT:    v_cvt_f16_f32_e32 v35, v13
-; SI-NEXT:    v_cvt_f16_f32_e32 v12, v14
-; SI-NEXT:    v_cvt_f16_f32_e32 v31, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v52, v16
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v17
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, v18
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v19
-; SI-NEXT:    v_cvt_f16_f32_e32 v51, v20
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, v21
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v22
-; SI-NEXT:    v_cvt_f16_f32_e32 v8, v23
-; SI-NEXT:    v_cvt_f16_f32_e32 v50, v24
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, v25
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v26
-; SI-NEXT:    v_cvt_f16_f32_e32 v7, v27
+; SI-NEXT:    v_cvt_f16_f32_e32 v63, v6
+; SI-NEXT:    v_cvt_f16_f32_e32 v44, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v60, v10
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v47, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v53, v15
+; SI-NEXT:    v_cvt_f16_f32_e32 v56, v18
+; SI-NEXT:    v_cvt_f16_f32_e32 v51, v19
+; SI-NEXT:    v_cvt_f16_f32_e32 v45, v22
+; SI-NEXT:    v_cvt_f16_f32_e32 v49, v23
+; SI-NEXT:    v_cvt_f16_f32_e32 v40, v26
+; SI-NEXT:    v_cvt_f16_f32_e32 v39, v27
+; SI-NEXT:    v_cvt_f16_f32_e32 v41, v30
+; SI-NEXT:    v_cvt_f16_f32_e32 v62, s16
+; SI-NEXT:    v_cvt_f16_f32_e32 v42, s19
+; SI-NEXT:    v_cvt_f16_f32_e32 v59, s20
+; SI-NEXT:    v_cvt_f16_f32_e32 v58, s23
+; SI-NEXT:    v_cvt_f16_f32_e32 v57, s24
+; SI-NEXT:    v_cvt_f16_f32_e32 v46, s27
+; SI-NEXT:    v_cvt_f16_f32_e32 v26, s28
+; SI-NEXT:    v_cvt_f16_f32_e32 v22, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v48, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v18, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v38, v8
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v36, v12
+; SI-NEXT:    v_cvt_f16_f32_e32 v12, v13
+; SI-NEXT:    v_cvt_f16_f32_e32 v34, v16
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v17
+; SI-NEXT:    v_cvt_f16_f32_e32 v8, v21
+; SI-NEXT:    v_cvt_f16_f32_e32 v30, v24
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v25
 ; SI-NEXT:    v_cvt_f16_f32_e32 v27, v28
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v29
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v30
-; SI-NEXT:    v_cvt_f16_f32_e32 v25, s16
-; SI-NEXT:    v_cvt_f16_f32_e32 v22, s18
-; SI-NEXT:    v_cvt_f16_f32_e32 v20, s19
-; SI-NEXT:    v_cvt_f16_f32_e32 v24, s20
-; SI-NEXT:    v_cvt_f16_f32_e32 v30, s21
-; SI-NEXT:    v_cvt_f16_f32_e32 v23, s22
-; SI-NEXT:    v_cvt_f16_f32_e32 v19, s23
-; SI-NEXT:    v_cvt_f16_f32_e32 v21, s24
-; SI-NEXT:    v_cvt_f16_f32_e32 v29, s25
-; SI-NEXT:    v_cvt_f16_f32_e32 v18, s26
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, s27
-; SI-NEXT:    v_cvt_f16_f32_e32 v16, s28
-; SI-NEXT:    v_cvt_f16_f32_e32 v28, s29
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v29
+; SI-NEXT:    v_cvt_f16_f32_e32 v54, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v24, s18
+; SI-NEXT:    v_cvt_f16_f32_e32 v52, s21
+; SI-NEXT:    v_cvt_f16_f32_e32 v50, s25
+; SI-NEXT:    v_cvt_f16_f32_e32 v16, s26
+; SI-NEXT:    v_cvt_f16_f32_e32 v29, s29
 ; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v40
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_cvt_f16_f32_e32 v26, v41
+; SI-NEXT:    v_cvt_f16_f32_e32 v31, v32
 ; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v42
+; SI-NEXT:    v_cvt_f16_f32_e32 v43, v33
+; SI-NEXT:    v_cvt_f16_f32_e32 v32, v20
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v43
-; SI-NEXT:    v_cvt_f16_f32_e32 v40, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v25, v35
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v44
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v37
+; SI-NEXT:    v_cvt_f16_f32_e32 v20, s22
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_cbranch_scc0 .LBB59_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_cbranch_execnz .LBB59_3
 ; SI-NEXT:  .LBB59_2: ; %cmp.true
-; SI-NEXT:    v_cvt_f32_f16_e32 v40, v40
-; SI-NEXT:    v_cvt_f32_f16_e32 v30, v30
-; SI-NEXT:    v_cvt_f32_f16_e32 v55, v55
-; SI-NEXT:    v_cvt_f32_f16_e32 v29, v29
-; SI-NEXT:    v_add_f32_e32 v40, 0x38000000, v40
-; SI-NEXT:    v_cvt_f16_f32_e32 v40, v40
-; SI-NEXT:    v_add_f32_e32 v41, 0x38000000, v30
-; SI-NEXT:    v_add_f32_e32 v55, 0x38000000, v55
-; SI-NEXT:    v_cvt_f32_f16_e32 v54, v54
-; SI-NEXT:    v_cvt_f32_f16_e32 v53, v53
-; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v40
-; SI-NEXT:    v_cvt_f16_f32_e32 v40, v41
-; SI-NEXT:    v_cvt_f16_f32_e32 v55, v55
-; SI-NEXT:    v_add_f32_e32 v29, 0x38000000, v29
-; SI-NEXT:    v_add_f32_e32 v54, 0x38000000, v54
-; SI-NEXT:    v_add_f32_e32 v53, 0x38000000, v53
-; SI-NEXT:    v_cvt_f16_f32_e32 v41, v29
-; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v40
-; SI-NEXT:    v_cvt_f16_f32_e32 v40, v54
-; SI-NEXT:    v_lshlrev_b32_e32 v54, 16, v55
-; SI-NEXT:    v_cvt_f16_f32_e32 v53, v53
-; SI-NEXT:    v_cvt_f32_f16_e32 v55, v52
-; SI-NEXT:    v_cvt_f32_f16_e32 v51, v51
-; SI-NEXT:    v_cvt_f32_f16_e32 v50, v50
-; SI-NEXT:    v_lshlrev_b32_e32 v52, 16, v53
-; SI-NEXT:    v_add_f32_e32 v53, 0x38000000, v55
-; SI-NEXT:    v_cvt_f16_f32_e32 v53, v53
-; SI-NEXT:    v_add_f32_e32 v51, 0x38000000, v51
-; SI-NEXT:    v_add_f32_e32 v50, 0x38000000, v50
-; SI-NEXT:    v_cvt_f16_f32_e32 v55, v51
-; SI-NEXT:    v_lshlrev_b32_e32 v51, 16, v53
-; SI-NEXT:    v_cvt_f16_f32_e32 v50, v50
-; SI-NEXT:    v_cvt_f32_f16_e32 v53, v27
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v54
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v52
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v50
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, v29
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v25, v25
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v50
-; SI-NEXT:    v_add_f32_e32 v50, 0x38000000, v53
-; SI-NEXT:    v_cvt_f32_f16_e32 v26, v26
-; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; SI-NEXT:    v_cvt_f16_f32_e32 v50, v50
+; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v1
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v3
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_add_f32_e32 v25, 0x38000000, v25
+; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v1
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v38
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v57
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v48
+; SI-NEXT:    v_cvt_f16_f32_e32 v25, v25
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v59
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v62
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_or_b32_e32 v59, v3, v19
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_or_b32_e32 v62, v1, v23
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v26
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v55
+; SI-NEXT:    v_or_b32_e32 v57, v3, v15
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v44
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v35, v35
-; SI-NEXT:    v_cvt_f32_f16_e32 v33, v33
+; SI-NEXT:    v_or_b32_e32 v1, v1, v21
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v36
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_or_b32_e32 v35, v5, v17
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v11
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_or_b32_e32 v3, v3, v13
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v34
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v53
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_or_b32_e32 v33, v5, v11
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v32
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
-; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
-; SI-NEXT:    v_cvt_f32_f16_e32 v34, v34
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v37
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v51
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    v_or_b32_e32 v32, v1, v9
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v30
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v49
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_or_b32_e32 v29, v3, v7
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v27
+; SI-NEXT:    v_cvt_f16_f32_e32 v26, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v39
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_or_b32_e32 v28, v26, v5
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v26, v31
+; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
+; SI-NEXT:    v_or_b32_e32 v27, v1, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v25
+; SI-NEXT:    v_cvt_f32_f16_e32 v25, v43
 ; SI-NEXT:    v_add_f32_e32 v26, 0x38000000, v26
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v26, v26
+; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; SI-NEXT:    v_add_f32_e32 v25, 0x38000000, v25
+; SI-NEXT:    v_cvt_f16_f32_e32 v43, v25
+; SI-NEXT:    v_or_b32_e32 v44, v26, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v25, v41
+; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v43
+; SI-NEXT:    v_or_b32_e32 v2, v2, v26
+; SI-NEXT:    v_cvt_f32_f16_e32 v26, v40
+; SI-NEXT:    v_add_f32_e32 v25, 0x38000000, v25
+; SI-NEXT:    v_cvt_f16_f32_e32 v41, v25
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; SI-NEXT:    v_add_f32_e32 v25, 0x38000000, v26
+; SI-NEXT:    v_cvt_f32_f16_e32 v26, v45
+; SI-NEXT:    v_cvt_f16_f32_e32 v40, v25
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v41
+; SI-NEXT:    v_or_b32_e32 v4, v4, v25
+; SI-NEXT:    v_add_f32_e32 v26, 0x38000000, v26
+; SI-NEXT:    v_cvt_f16_f32_e32 v45, v26
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v40
+; SI-NEXT:    v_or_b32_e32 v6, v6, v25
+; SI-NEXT:    v_cvt_f32_f16_e32 v25, v56
+; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v45
+; SI-NEXT:    v_or_b32_e32 v8, v8, v26
+; SI-NEXT:    v_cvt_f32_f16_e32 v26, v47
+; SI-NEXT:    v_add_f32_e32 v25, 0x38000000, v25
+; SI-NEXT:    v_cvt_f16_f32_e32 v56, v25
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; SI-NEXT:    v_add_f32_e32 v25, 0x38000000, v26
+; SI-NEXT:    v_cvt_f32_f16_e32 v26, v60
 ; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
-; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
-; SI-NEXT:    v_cvt_f32_f16_e32 v38, v38
-; SI-NEXT:    v_cvt_f32_f16_e32 v36, v36
-; SI-NEXT:    v_cvt_f16_f32_e32 v53, v26
-; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v50
-; SI-NEXT:    v_lshlrev_b32_e32 v50, 16, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v47, v25
 ; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
-; SI-NEXT:    v_cvt_f32_f16_e32 v49, v49
-; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
-; SI-NEXT:    v_or_b32_e32 v2, v2, v50
-; SI-NEXT:    v_lshlrev_b32_e32 v50, 16, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_add_f32_e32 v26, 0x38000000, v26
 ; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; SI-NEXT:    v_add_f32_e32 v35, 0x38000000, v35
-; SI-NEXT:    v_add_f32_e32 v33, 0x38000000, v33
+; SI-NEXT:    v_cvt_f16_f32_e32 v60, v26
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v56
+; SI-NEXT:    v_or_b32_e32 v10, v10, v25
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v47
+; SI-NEXT:    v_or_b32_e32 v12, v12, v25
+; SI-NEXT:    v_cvt_f32_f16_e32 v25, v63
+; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v60
+; SI-NEXT:    v_or_b32_e32 v14, v14, v26
 ; SI-NEXT:    v_cvt_f32_f16_e32 v18, v18
-; SI-NEXT:    v_cvt_f32_f16_e32 v19, v19
-; SI-NEXT:    v_or_b32_e32 v5, v5, v50
-; SI-NEXT:    v_lshlrev_b32_e32 v50, 16, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v35, v35
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, v33
-; SI-NEXT:    v_add_f32_e32 v34, 0x38000000, v34
-; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v37
-; SI-NEXT:    v_cvt_f32_f16_e32 v23, v23
-; SI-NEXT:    v_cvt_f32_f16_e32 v20, v20
-; SI-NEXT:    v_cvt_f32_f16_e32 v28, v28
-; SI-NEXT:    v_or_b32_e32 v11, v11, v50
-; SI-NEXT:    v_lshlrev_b32_e32 v50, 16, v9
-; SI-NEXT:    v_cvt_f16_f32_e32 v34, v34
-; SI-NEXT:    v_cvt_f16_f32_e32 v37, v37
-; SI-NEXT:    v_add_f32_e32 v38, 0x38000000, v38
-; SI-NEXT:    v_add_f32_e32 v36, 0x38000000, v36
+; SI-NEXT:    v_cvt_f32_f16_e32 v26, v61
 ; SI-NEXT:    v_cvt_f32_f16_e32 v22, v22
-; SI-NEXT:    v_cvt_f32_f16_e32 v25, v25
-; SI-NEXT:    v_cvt_f32_f16_e32 v24, v24
-; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, v48
-; SI-NEXT:    v_cvt_f32_f16_e32 v39, v39
-; SI-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT:    v_or_b32_e32 v10, v10, v50
-; SI-NEXT:    v_lshlrev_b32_e32 v50, 16, v13
-; SI-NEXT:    v_cvt_f16_f32_e32 v38, v38
-; SI-NEXT:    v_cvt_f16_f32_e32 v36, v36
-; SI-NEXT:    v_add_f32_e32 v49, 0x38000000, v49
-; SI-NEXT:    v_add_f32_e32 v17, 0x38000000, v17
-; SI-NEXT:    v_or_b32_e32 v14, v14, v50
-; SI-NEXT:    v_lshlrev_b32_e32 v50, 16, v12
-; SI-NEXT:    v_cvt_f16_f32_e32 v49, v49
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
+; SI-NEXT:    v_add_f32_e32 v25, 0x38000000, v25
+; SI-NEXT:    v_cvt_f16_f32_e32 v63, v25
 ; SI-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
-; SI-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
-; SI-NEXT:    v_or_b32_e32 v35, v35, v50
-; SI-NEXT:    v_lshlrev_b32_e32 v50, 16, v33
+; SI-NEXT:    v_add_f32_e32 v25, 0x38000000, v26
 ; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
-; SI-NEXT:    v_add_f32_e32 v23, 0x38000000, v23
-; SI-NEXT:    v_add_f32_e32 v20, 0x38000000, v20
-; SI-NEXT:    v_add_f32_e32 v28, 0x38000000, v28
-; SI-NEXT:    v_or_b32_e32 v34, v34, v50
-; SI-NEXT:    v_lshlrev_b32_e32 v50, 16, v37
-; SI-NEXT:    v_cvt_f16_f32_e32 v23, v23
-; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
+; SI-NEXT:    v_cvt_f16_f32_e32 v61, v25
 ; SI-NEXT:    v_add_f32_e32 v22, 0x38000000, v22
+; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v63
+; SI-NEXT:    v_or_b32_e32 v18, v18, v25
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v61
+; SI-NEXT:    v_or_b32_e32 v22, v22, v25
+; SI-NEXT:    v_cvt_f32_f16_e32 v25, v58
+; SI-NEXT:    v_cvt_f32_f16_e32 v26, v46
+; SI-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; SI-NEXT:    v_cvt_f32_f16_e32 v20, v20
 ; SI-NEXT:    v_add_f32_e32 v25, 0x38000000, v25
-; SI-NEXT:    v_add_f32_e32 v24, 0x38000000, v24
-; SI-NEXT:    v_add_f32_e32 v21, 0x38000000, v21
-; SI-NEXT:    v_cvt_f16_f32_e32 v28, v28
+; SI-NEXT:    v_cvt_f16_f32_e32 v58, v25
+; SI-NEXT:    v_cvt_f32_f16_e32 v25, v42
+; SI-NEXT:    v_add_f32_e32 v26, 0x38000000, v26
+; SI-NEXT:    v_cvt_f32_f16_e32 v24, v24
+; SI-NEXT:    v_cvt_f16_f32_e32 v46, v26
 ; SI-NEXT:    v_add_f32_e32 v16, 0x38000000, v16
-; SI-NEXT:    v_add_f32_e32 v48, 0x38000000, v48
-; SI-NEXT:    v_add_f32_e32 v39, 0x38000000, v39
-; SI-NEXT:    v_add_f32_e32 v32, 0x38000000, v32
-; SI-NEXT:    v_add_f32_e32 v31, 0x38000000, v31
-; SI-NEXT:    v_add_f32_e32 v15, 0x38000000, v15
-; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
-; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
-; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
-; SI-NEXT:    v_or_b32_e32 v38, v38, v50
-; SI-NEXT:    v_lshlrev_b32_e32 v50, 16, v36
-; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
-; SI-NEXT:    v_cvt_f16_f32_e32 v25, v25
-; SI-NEXT:    v_cvt_f16_f32_e32 v24, v24
-; SI-NEXT:    v_cvt_f16_f32_e32 v21, v21
 ; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, v48
-; SI-NEXT:    v_cvt_f16_f32_e32 v39, v39
-; SI-NEXT:    v_cvt_f16_f32_e32 v32, v32
-; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; SI-NEXT:    v_or_b32_e32 v49, v49, v50
-; SI-NEXT:    v_lshlrev_b32_e32 v50, 16, v17
-; SI-NEXT:    v_or_b32_e32 v18, v18, v50
-; SI-NEXT:    v_lshlrev_b32_e32 v50, 16, v19
-; SI-NEXT:    v_or_b32_e32 v23, v23, v50
-; SI-NEXT:    v_lshlrev_b32_e32 v50, 16, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v41, 16, v41
-; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
-; SI-NEXT:    v_lshlrev_b32_e32 v42, 16, v40
-; SI-NEXT:    v_lshlrev_b32_e32 v43, 16, v55
-; SI-NEXT:    v_lshlrev_b32_e32 v44, 16, v53
-; SI-NEXT:    v_or_b32_e32 v22, v22, v50
-; SI-NEXT:    v_or_b32_e32 v25, v25, v30
-; SI-NEXT:    v_or_b32_e32 v24, v24, v29
-; SI-NEXT:    v_or_b32_e32 v21, v21, v41
-; SI-NEXT:    v_or_b32_e32 v16, v16, v28
-; SI-NEXT:    v_or_b32_e32 v48, v48, v54
-; SI-NEXT:    v_or_b32_e32 v39, v39, v42
-; SI-NEXT:    v_or_b32_e32 v32, v32, v52
-; SI-NEXT:    v_or_b32_e32 v31, v31, v51
-; SI-NEXT:    v_or_b32_e32 v15, v15, v43
-; SI-NEXT:    v_or_b32_e32 v8, v8, v27
-; SI-NEXT:    v_or_b32_e32 v7, v7, v26
-; SI-NEXT:    v_or_b32_e32 v6, v6, v44
-; SI-NEXT:    v_alignbit_b32 v40, v22, v30, 16
-; SI-NEXT:    v_alignbit_b32 v30, v23, v29, 16
-; SI-NEXT:    v_alignbit_b32 v29, v18, v41, 16
-; SI-NEXT:    v_alignbit_b32 v28, v49, v28, 16
-; SI-NEXT:    v_alignbit_b32 v55, v38, v54, 16
-; SI-NEXT:    v_alignbit_b32 v54, v34, v42, 16
-; SI-NEXT:    v_alignbit_b32 v53, v35, v52, 16
-; SI-NEXT:    v_alignbit_b32 v52, v14, v51, 16
-; SI-NEXT:    v_alignbit_b32 v51, v10, v43, 16
-; SI-NEXT:    v_alignbit_b32 v50, v11, v27, 16
-; SI-NEXT:    v_alignbit_b32 v27, v5, v26, 16
-; SI-NEXT:    v_alignbit_b32 v26, v2, v44, 16
+; SI-NEXT:    v_add_f32_e32 v20, 0x38000000, v20
+; SI-NEXT:    v_add_f32_e32 v25, 0x38000000, v25
+; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
+; SI-NEXT:    v_cvt_f16_f32_e32 v42, v25
+; SI-NEXT:    v_add_f32_e32 v24, 0x38000000, v24
+; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v46
+; SI-NEXT:    v_cvt_f16_f32_e32 v24, v24
+; SI-NEXT:    v_or_b32_e32 v16, v16, v26
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v58
+; SI-NEXT:    v_lshr_b64 v[50:51], v[15:16], 16
+; SI-NEXT:    v_or_b32_e32 v20, v20, v25
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v42
+; SI-NEXT:    v_mov_b32_e32 v51, v29
+; SI-NEXT:    v_lshr_b64 v[29:30], v[21:22], 16
+; SI-NEXT:    v_or_b32_e32 v24, v24, v25
+; SI-NEXT:    v_lshr_b64 v[30:31], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[25:26], v[1:2], 16
+; SI-NEXT:    v_mov_b32_e32 v31, v44
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshr_b64 v[54:55], v[23:24], 16
+; SI-NEXT:    v_lshr_b64 v[52:53], v[19:20], 16
+; SI-NEXT:    v_lshr_b64 v[48:49], v[17:18], 16
+; SI-NEXT:    v_lshr_b64 v[38:39], v[13:14], 16
+; SI-NEXT:    v_mov_b32_e32 v55, v35
+; SI-NEXT:    v_mov_b32_e32 v53, v32
+; SI-NEXT:    v_mov_b32_e32 v49, v28
+; SI-NEXT:    v_mov_b32_e32 v39, v27
+; SI-NEXT:    v_lshr_b64 v[36:37], v[11:12], 16
+; SI-NEXT:    v_mov_b32_e32 v11, v33
+; SI-NEXT:    v_lshr_b64 v[34:35], v[9:10], 16
+; SI-NEXT:    v_lshr_b64 v[32:33], v[7:8], 16
+; SI-NEXT:    v_lshr_b64 v[27:28], v[3:4], 16
 ; SI-NEXT:  .LBB59_3: ; %end
-; SI-NEXT:    v_and_b32_e32 v25, 0xffff, v25
-; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v40
-; SI-NEXT:    v_and_b32_e32 v22, 0xffff, v22
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
-; SI-NEXT:    v_or_b32_e32 v25, v25, v40
-; SI-NEXT:    v_or_b32_e32 v20, v22, v20
-; SI-NEXT:    v_add_i32_e32 v22, vcc, 4, v0
-; SI-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v20, v22, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v54
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v62
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v30
-; SI-NEXT:    v_or_b32_e32 v20, v20, v22
-; SI-NEXT:    v_add_i32_e32 v22, vcc, 8, v0
-; SI-NEXT:    buffer_store_dword v20, v22, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v24
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v42
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 4, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v23
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; SI-NEXT:    v_or_b32_e32 v19, v20, v19
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 12, v0
-; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v52
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v59
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 8, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v21
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v29
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v18
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v17
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 16, v0
-; SI-NEXT:    v_or_b32_e32 v17, v18, v17
-; SI-NEXT:    v_add_i32_e32 v18, vcc, 20, v0
-; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v20
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v58
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 12, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v28
-; SI-NEXT:    v_or_b32_e32 v16, v16, v17
-; SI-NEXT:    v_add_i32_e32 v17, vcc, 24, v0
-; SI-NEXT:    buffer_store_dword v16, v17, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v50
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v57
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v49
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v36
-; SI-NEXT:    v_or_b32_e32 v16, v16, v17
-; SI-NEXT:    v_add_i32_e32 v17, vcc, 28, v0
-; SI-NEXT:    buffer_store_dword v16, v17, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v16
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v46
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 20, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt vmcnt(6) expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v26
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v29
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 24, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v48
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v55
-; SI-NEXT:    v_or_b32_e32 v16, v16, v17
-; SI-NEXT:    v_add_i32_e32 v17, vcc, 32, v0
-; SI-NEXT:    buffer_store_dword v16, v17, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v22
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v61
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 28, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v38
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v37
-; SI-NEXT:    v_or_b32_e32 v16, v16, v17
-; SI-NEXT:    v_add_i32_e32 v17, vcc, 36, v0
-; SI-NEXT:    buffer_store_dword v16, v17, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v55
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v48
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 32, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v39
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v54
-; SI-NEXT:    v_or_b32_e32 v16, v16, v17
-; SI-NEXT:    v_add_i32_e32 v17, vcc, 40, v0
-; SI-NEXT:    buffer_store_dword v16, v17, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v18
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v63
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 36, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v34
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v33
-; SI-NEXT:    v_or_b32_e32 v16, v16, v17
-; SI-NEXT:    v_add_i32_e32 v17, vcc, 44, v0
-; SI-NEXT:    buffer_store_dword v16, v17, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v44
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v38
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 40, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v32
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v53
-; SI-NEXT:    v_or_b32_e32 v16, v16, v17
-; SI-NEXT:    v_add_i32_e32 v17, vcc, 48, v0
-; SI-NEXT:    buffer_store_dword v16, v17, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v60
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 44, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v35
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; SI-NEXT:    v_or_b32_e32 v12, v16, v12
-; SI-NEXT:    v_add_i32_e32 v16, vcc, 52, v0
-; SI-NEXT:    buffer_store_dword v12, v16, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v36
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 48, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v12, 0xffff, v31
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v52
-; SI-NEXT:    v_or_b32_e32 v12, v12, v16
-; SI-NEXT:    v_add_i32_e32 v16, vcc, 56, v0
-; SI-NEXT:    buffer_store_dword v12, v16, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v12
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v47
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 52, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v12, 0xffff, v14
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT:    v_or_b32_e32 v12, v12, v13
-; SI-NEXT:    v_add_i32_e32 v13, vcc, 60, v0
-; SI-NEXT:    buffer_store_dword v12, v13, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v53
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v34
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 56, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v12, 0xffff, v15
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v51
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT:    v_or_b32_e32 v12, v12, v13
-; SI-NEXT:    v_add_i32_e32 v13, vcc, 64, v0
-; SI-NEXT:    v_or_b32_e32 v9, v10, v9
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 0x44, v0
-; SI-NEXT:    buffer_store_dword v12, v13, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v9, v10, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v10
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v56
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 60, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v50
-; SI-NEXT:    v_or_b32_e32 v8, v8, v9
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 0x48, v0
-; SI-NEXT:    buffer_store_dword v8, v9, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v51
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v32
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 64, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_or_b32_e32 v3, v8, v3
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x4c, v0
-; SI-NEXT:    buffer_store_dword v3, v8, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v27
-; SI-NEXT:    v_or_b32_e32 v3, v3, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x50, v0
-; SI-NEXT:    buffer_store_dword v3, v7, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v8
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v45
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x44, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT:    v_or_b32_e32 v3, v3, v4
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x54, v0
-; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v49
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v30
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x48, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v26
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v3, v3, v4
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x58, v0
-; SI-NEXT:    v_or_b32_e32 v1, v2, v1
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v40
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x4c, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v39
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v27
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x50, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v41
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x54, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v31
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v25
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x58, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v43
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x5c, v0
-; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB59_4:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
index ed67c6b397f11..29005a42d8860 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
@@ -3408,313 +3408,333 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3
 ; SI-LABEL: bitcast_v26i32_to_v52i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_writelane_b32 v14, s30, 0
+; SI-NEXT:    v_writelane_b32 v14, s31, 1
+; SI-NEXT:    v_writelane_b32 v14, s34, 2
+; SI-NEXT:    v_writelane_b32 v14, s35, 3
+; SI-NEXT:    v_writelane_b32 v14, s36, 4
+; SI-NEXT:    v_writelane_b32 v14, s37, 5
+; SI-NEXT:    v_writelane_b32 v14, s38, 6
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
-; SI-NEXT:    v_readfirstlane_b32 s41, v1
-; SI-NEXT:    v_readfirstlane_b32 s40, v2
-; SI-NEXT:    v_readfirstlane_b32 s15, v3
-; SI-NEXT:    v_readfirstlane_b32 s14, v4
-; SI-NEXT:    v_readfirstlane_b32 s13, v5
-; SI-NEXT:    v_readfirstlane_b32 s12, v6
-; SI-NEXT:    v_readfirstlane_b32 s11, v7
-; SI-NEXT:    v_readfirstlane_b32 s10, v8
-; SI-NEXT:    v_readfirstlane_b32 s9, v9
-; SI-NEXT:    v_readfirstlane_b32 s8, v10
-; SI-NEXT:    v_readfirstlane_b32 s7, v11
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_readfirstlane_b32 s6, v12
+; SI-NEXT:    v_writelane_b32 v14, s39, 7
+; SI-NEXT:    v_readfirstlane_b32 s14, v1
+; SI-NEXT:    v_readfirstlane_b32 s15, v2
+; SI-NEXT:    v_readfirstlane_b32 s12, v3
+; SI-NEXT:    v_readfirstlane_b32 s13, v4
+; SI-NEXT:    v_readfirstlane_b32 s10, v5
+; SI-NEXT:    v_readfirstlane_b32 s11, v6
+; SI-NEXT:    v_readfirstlane_b32 s8, v7
+; SI-NEXT:    v_readfirstlane_b32 s9, v8
+; SI-NEXT:    v_readfirstlane_b32 s6, v9
+; SI-NEXT:    v_readfirstlane_b32 s7, v10
+; SI-NEXT:    v_readfirstlane_b32 s4, v11
+; SI-NEXT:    s_and_b64 s[40:41], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s5, v12
+; SI-NEXT:    v_writelane_b32 v14, s48, 8
 ; SI-NEXT:    s_cbranch_scc0 .LBB13_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s9
-; SI-NEXT:    v_mov_b32_e32 v3, s11
-; SI-NEXT:    v_mov_b32_e32 v4, s13
-; SI-NEXT:    v_mov_b32_e32 v5, s15
-; SI-NEXT:    v_mov_b32_e32 v6, s41
-; SI-NEXT:    v_mov_b32_e32 v7, s28
-; SI-NEXT:    v_mov_b32_e32 v8, s26
-; SI-NEXT:    v_mov_b32_e32 v9, s24
-; SI-NEXT:    v_mov_b32_e32 v10, s22
-; SI-NEXT:    v_mov_b32_e32 v11, s20
-; SI-NEXT:    v_mov_b32_e32 v12, s18
-; SI-NEXT:    v_mov_b32_e32 v13, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v2, s8, v2, 16
-; SI-NEXT:    v_alignbit_b32 v3, s10, v3, 16
-; SI-NEXT:    v_alignbit_b32 v4, s12, v4, 16
-; SI-NEXT:    v_alignbit_b32 v5, s14, v5, 16
-; SI-NEXT:    v_alignbit_b32 v6, s40, v6, 16
-; SI-NEXT:    v_alignbit_b32 v7, s29, v7, 16
-; SI-NEXT:    v_alignbit_b32 v8, s27, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, s25, v9, 16
-; SI-NEXT:    v_alignbit_b32 v10, s23, v10, 16
-; SI-NEXT:    v_alignbit_b32 v11, s21, v11, 16
-; SI-NEXT:    v_alignbit_b32 v12, s19, v12, 16
-; SI-NEXT:    v_alignbit_b32 v13, s17, v13, 16
-; SI-NEXT:    s_lshr_b32 s42, s6, 16
-; SI-NEXT:    s_lshr_b32 s43, s8, 16
-; SI-NEXT:    s_lshr_b32 s44, s10, 16
-; SI-NEXT:    s_lshr_b32 s45, s12, 16
-; SI-NEXT:    s_lshr_b32 s46, s14, 16
-; SI-NEXT:    s_lshr_b32 s47, s40, 16
-; SI-NEXT:    s_lshr_b32 s56, s29, 16
-; SI-NEXT:    s_lshr_b32 s57, s27, 16
-; SI-NEXT:    s_lshr_b32 s58, s25, 16
-; SI-NEXT:    s_lshr_b32 s59, s23, 16
-; SI-NEXT:    s_lshr_b32 s60, s21, 16
-; SI-NEXT:    s_lshr_b32 s61, s19, 16
-; SI-NEXT:    s_lshr_b32 s62, s17, 16
+; SI-NEXT:    s_lshr_b32 s92, s5, 16
+; SI-NEXT:    s_lshr_b32 s93, s7, 16
+; SI-NEXT:    s_lshr_b32 s94, s9, 16
+; SI-NEXT:    s_lshr_b32 s95, s11, 16
+; SI-NEXT:    s_lshr_b32 s30, s13, 16
+; SI-NEXT:    s_lshr_b32 s31, s15, 16
+; SI-NEXT:    s_lshr_b32 s34, s29, 16
+; SI-NEXT:    s_lshr_b32 s35, s27, 16
+; SI-NEXT:    s_lshr_b32 s36, s25, 16
+; SI-NEXT:    s_lshr_b32 s37, s23, 16
+; SI-NEXT:    s_lshr_b32 s38, s21, 16
+; SI-NEXT:    s_lshr_b32 s39, s19, 16
+; SI-NEXT:    s_lshr_b32 s48, s17, 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[42:43], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[10:11], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[12:13], 16
+; SI-NEXT:    s_lshr_b64 s[58:59], s[14:15], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[62:63], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[72:73], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[74:75], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[76:77], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[78:79], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[88:89], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB13_3
 ; SI-NEXT:  .LBB13_2: ; %cmp.true
+; SI-NEXT:    s_add_i32 s17, s17, 3
 ; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    s_add_i32 s19, s19, 3
 ; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_add_i32 s21, s21, 3
 ; SI-NEXT:    s_add_i32 s20, s20, 3
+; SI-NEXT:    s_add_i32 s23, s23, 3
 ; SI-NEXT:    s_add_i32 s22, s22, 3
+; SI-NEXT:    s_add_i32 s25, s25, 3
 ; SI-NEXT:    s_add_i32 s24, s24, 3
+; SI-NEXT:    s_add_i32 s27, s27, 3
 ; SI-NEXT:    s_add_i32 s26, s26, 3
+; SI-NEXT:    s_add_i32 s29, s29, 3
 ; SI-NEXT:    s_add_i32 s28, s28, 3
-; SI-NEXT:    s_add_i32 s41, s41, 3
 ; SI-NEXT:    s_add_i32 s15, s15, 3
-; SI-NEXT:    s_add_i32 s13, s13, 3
-; SI-NEXT:    s_add_i32 s11, s11, 3
-; SI-NEXT:    s_add_i32 s9, s9, 3
-; SI-NEXT:    s_add_i32 s7, s7, 3
-; SI-NEXT:    s_add_i32 s17, s17, 3
-; SI-NEXT:    s_add_i32 s19, s19, 3
-; SI-NEXT:    s_add_i32 s21, s21, 3
-; SI-NEXT:    s_add_i32 s23, s23, 3
-; SI-NEXT:    s_add_i32 s25, s25, 3
-; SI-NEXT:    s_add_i32 s27, s27, 3
-; SI-NEXT:    s_add_i32 s29, s29, 3
-; SI-NEXT:    s_add_i32 s40, s40, 3
 ; SI-NEXT:    s_add_i32 s14, s14, 3
+; SI-NEXT:    s_add_i32 s13, s13, 3
 ; SI-NEXT:    s_add_i32 s12, s12, 3
+; SI-NEXT:    s_add_i32 s11, s11, 3
 ; SI-NEXT:    s_add_i32 s10, s10, 3
+; SI-NEXT:    s_add_i32 s9, s9, 3
 ; SI-NEXT:    s_add_i32 s8, s8, 3
+; SI-NEXT:    s_add_i32 s7, s7, 3
 ; SI-NEXT:    s_add_i32 s6, s6, 3
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s9
-; SI-NEXT:    v_mov_b32_e32 v3, s11
-; SI-NEXT:    v_mov_b32_e32 v4, s13
-; SI-NEXT:    v_mov_b32_e32 v5, s15
-; SI-NEXT:    v_mov_b32_e32 v6, s41
-; SI-NEXT:    v_mov_b32_e32 v7, s28
-; SI-NEXT:    v_mov_b32_e32 v8, s26
-; SI-NEXT:    v_mov_b32_e32 v9, s24
-; SI-NEXT:    v_mov_b32_e32 v10, s22
-; SI-NEXT:    v_mov_b32_e32 v11, s20
-; SI-NEXT:    v_mov_b32_e32 v12, s18
-; SI-NEXT:    v_mov_b32_e32 v13, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v2, s8, v2, 16
-; SI-NEXT:    v_alignbit_b32 v3, s10, v3, 16
-; SI-NEXT:    v_alignbit_b32 v4, s12, v4, 16
-; SI-NEXT:    v_alignbit_b32 v5, s14, v5, 16
-; SI-NEXT:    v_alignbit_b32 v6, s40, v6, 16
-; SI-NEXT:    v_alignbit_b32 v7, s29, v7, 16
-; SI-NEXT:    v_alignbit_b32 v8, s27, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, s25, v9, 16
-; SI-NEXT:    v_alignbit_b32 v10, s23, v10, 16
-; SI-NEXT:    v_alignbit_b32 v11, s21, v11, 16
-; SI-NEXT:    v_alignbit_b32 v12, s19, v12, 16
-; SI-NEXT:    v_alignbit_b32 v13, s17, v13, 16
-; SI-NEXT:    s_lshr_b32 s42, s6, 16
-; SI-NEXT:    s_lshr_b32 s43, s8, 16
-; SI-NEXT:    s_lshr_b32 s44, s10, 16
-; SI-NEXT:    s_lshr_b32 s45, s12, 16
-; SI-NEXT:    s_lshr_b32 s46, s14, 16
-; SI-NEXT:    s_lshr_b32 s47, s40, 16
-; SI-NEXT:    s_lshr_b32 s56, s29, 16
-; SI-NEXT:    s_lshr_b32 s57, s27, 16
-; SI-NEXT:    s_lshr_b32 s58, s25, 16
-; SI-NEXT:    s_lshr_b32 s59, s23, 16
-; SI-NEXT:    s_lshr_b32 s60, s21, 16
-; SI-NEXT:    s_lshr_b32 s61, s19, 16
-; SI-NEXT:    s_lshr_b32 s62, s17, 16
+; SI-NEXT:    s_add_i32 s5, s5, 3
+; SI-NEXT:    s_add_i32 s4, s4, 3
+; SI-NEXT:    s_lshr_b64 s[40:41], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[42:43], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[10:11], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[12:13], 16
+; SI-NEXT:    s_lshr_b64 s[58:59], s[14:15], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[28:29], 16
+; SI-NEXT:    s_lshr_b32 s92, s5, 16
+; SI-NEXT:    s_lshr_b32 s93, s7, 16
+; SI-NEXT:    s_lshr_b32 s94, s9, 16
+; SI-NEXT:    s_lshr_b32 s95, s11, 16
+; SI-NEXT:    s_lshr_b32 s30, s13, 16
+; SI-NEXT:    s_lshr_b32 s31, s15, 16
+; SI-NEXT:    s_lshr_b32 s34, s29, 16
+; SI-NEXT:    s_lshr_b32 s35, s27, 16
+; SI-NEXT:    s_lshr_b32 s36, s25, 16
+; SI-NEXT:    s_lshr_b32 s37, s23, 16
+; SI-NEXT:    s_lshr_b32 s38, s21, 16
+; SI-NEXT:    s_lshr_b32 s39, s19, 16
+; SI-NEXT:    s_lshr_b32 s48, s17, 16
+; SI-NEXT:    s_lshr_b64 s[62:63], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[72:73], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[74:75], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[76:77], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[78:79], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[88:89], s[16:17], 16
 ; SI-NEXT:  .LBB13_3: ; %end
-; SI-NEXT:    s_and_b32 s4, s16, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT:    v_or_b32_e32 v13, s4, v13
-; SI-NEXT:    s_and_b32 s4, s17, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s62, 16
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_mov_b32_e32 v14, s4
-; SI-NEXT:    s_and_b32 s4, s18, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; SI-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v13, vcc, 4, v0
-; SI-NEXT:    v_or_b32_e32 v12, s4, v12
-; SI-NEXT:    s_and_b32 s4, s19, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s61, 16
-; SI-NEXT:    buffer_store_dword v14, v13, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v13, vcc, 8, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v12, v13, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v13, s4
-; SI-NEXT:    s_and_b32 s4, s20, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT:    s_lshl_b32 s41, s88, 16
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s41
+; SI-NEXT:    v_mov_b32_e32 v1, s16
+; SI-NEXT:    s_and_b32 s16, s17, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s48, 16
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_lshl_b32 s16, s78, 16
+; SI-NEXT:    s_and_b32 s17, s18, 0xffff
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    v_mov_b32_e32 v3, s16
+; SI-NEXT:    s_and_b32 s16, s19, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s39, 16
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v4, s16
+; SI-NEXT:    s_lshl_b32 s16, s76, 16
+; SI-NEXT:    s_and_b32 s17, s20, 0xffff
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v12, vcc, 12, v0
-; SI-NEXT:    v_or_b32_e32 v11, s4, v11
-; SI-NEXT:    s_and_b32 s4, s21, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s60, 16
-; SI-NEXT:    buffer_store_dword v13, v12, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v12, vcc, 16, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v11, v12, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v12, s4
-; SI-NEXT:    s_and_b32 s4, s22, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
+; SI-NEXT:    v_mov_b32_e32 v5, s16
+; SI-NEXT:    s_and_b32 s16, s21, 0xffff
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
+; SI-NEXT:    s_lshl_b32 s17, s38, 16
+; SI-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 12, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 16, v0
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s22, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s74, 16
+; SI-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 20, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 20, v0
-; SI-NEXT:    v_or_b32_e32 v10, s4, v10
-; SI-NEXT:    s_and_b32 s4, s23, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s59, 16
-; SI-NEXT:    buffer_store_dword v12, v11, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 24, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v10, v11, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v11, s4
-; SI-NEXT:    s_and_b32 s4, s24, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s23, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s37, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 24, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 28, v0
-; SI-NEXT:    v_or_b32_e32 v9, s4, v9
-; SI-NEXT:    s_and_b32 s4, s25, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s58, 16
-; SI-NEXT:    buffer_store_dword v11, v10, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 32, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v9, v10, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v10, s4
-; SI-NEXT:    s_and_b32 s4, s26, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s24, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s72, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 28, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 36, v0
-; SI-NEXT:    v_or_b32_e32 v8, s4, v8
-; SI-NEXT:    s_and_b32 s4, s27, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s57, 16
-; SI-NEXT:    buffer_store_dword v10, v9, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 40, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v8, v9, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v9, s4
-; SI-NEXT:    s_and_b32 s4, s28, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s25, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s36, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 32, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 44, v0
-; SI-NEXT:    v_or_b32_e32 v7, s4, v7
-; SI-NEXT:    s_and_b32 s4, s29, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s56, 16
-; SI-NEXT:    buffer_store_dword v9, v8, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v8, s4
-; SI-NEXT:    s_and_b32 s4, s41, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s26, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s62, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 36, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 52, v0
-; SI-NEXT:    v_or_b32_e32 v6, s4, v6
-; SI-NEXT:    s_and_b32 s4, s40, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s47, 16
-; SI-NEXT:    buffer_store_dword v8, v7, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 56, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v7, s4
-; SI-NEXT:    s_and_b32 s4, s15, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s27, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s35, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 40, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 60, v0
-; SI-NEXT:    v_or_b32_e32 v5, s4, v5
-; SI-NEXT:    s_and_b32 s4, s14, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s46, 16
-; SI-NEXT:    buffer_store_dword v7, v6, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 64, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v6, s4
-; SI-NEXT:    s_and_b32 s4, s13, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s28, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s60, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 44, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x44, v0
-; SI-NEXT:    v_or_b32_e32 v4, s4, v4
-; SI-NEXT:    s_and_b32 s4, s12, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s45, 16
-; SI-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v5, s4
-; SI-NEXT:    s_and_b32 s4, s11, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s29, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s34, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 48, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x4c, v0
-; SI-NEXT:    v_or_b32_e32 v3, s4, v3
-; SI-NEXT:    s_and_b32 s4, s10, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s44, 16
-; SI-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x50, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v4, s4
-; SI-NEXT:    s_and_b32 s4, s9, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s14, s14, 0xffff
+; SI-NEXT:    s_lshl_b32 s16, s58, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 52, v0
+; SI-NEXT:    s_or_b32 s14, s14, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x54, v0
-; SI-NEXT:    v_or_b32_e32 v2, s4, v2
-; SI-NEXT:    s_and_b32 s4, s8, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s43, 16
-; SI-NEXT:    buffer_store_dword v4, v3, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x58, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v3, s4
-; SI-NEXT:    s_and_b32 s4, s7, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_mov_b32_e32 v2, s14
+; SI-NEXT:    s_and_b32 s14, s15, 0xffff
+; SI-NEXT:    s_lshl_b32 s15, s31, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 56, v0
+; SI-NEXT:    s_or_b32 s14, s14, s15
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x5c, v0
-; SI-NEXT:    v_or_b32_e32 v1, s4, v1
-; SI-NEXT:    s_and_b32 s4, s6, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s42, 16
-; SI-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x60, v0
+; SI-NEXT:    v_mov_b32_e32 v2, s14
+; SI-NEXT:    s_and_b32 s12, s12, 0xffff
+; SI-NEXT:    s_lshl_b32 s14, s56, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 60, v0
+; SI-NEXT:    s_or_b32 s12, s12, s14
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s12
+; SI-NEXT:    s_and_b32 s12, s13, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s30, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 64, v0
+; SI-NEXT:    s_or_b32 s12, s12, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s12
+; SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; SI-NEXT:    s_lshl_b32 s12, s46, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x44, v0
+; SI-NEXT:    s_or_b32 s10, s10, s12
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s10, s11, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s95, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x48, v0
+; SI-NEXT:    s_or_b32 s10, s10, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; SI-NEXT:    s_lshl_b32 s10, s44, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x4c, v0
+; SI-NEXT:    s_or_b32 s8, s8, s10
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s8, s9, 0xffff
+; SI-NEXT:    s_lshl_b32 s9, s94, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x50, v0
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_lshl_b32 s8, s42, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x54, v0
+; SI-NEXT:    s_or_b32 s6, s6, s8
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s6, s7, 0xffff
+; SI-NEXT:    s_lshl_b32 s7, s93, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x58, v0
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; SI-NEXT:    s_lshl_b32 s6, s40, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x5c, v0
+; SI-NEXT:    s_or_b32 s4, s4, s6
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s4
+; SI-NEXT:    s_and_b32 s4, s5, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s92, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x60, v0
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x64, v0
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT:    v_readlane_b32 s48, v14, 8
+; SI-NEXT:    v_readlane_b32 s39, v14, 7
+; SI-NEXT:    v_readlane_b32 s38, v14, 6
+; SI-NEXT:    v_readlane_b32 s37, v14, 5
+; SI-NEXT:    v_readlane_b32 s36, v14, 4
+; SI-NEXT:    v_readlane_b32 s35, v14, 3
+; SI-NEXT:    v_readlane_b32 s34, v14, 2
+; SI-NEXT:    v_readlane_b32 s31, v14, 1
+; SI-NEXT:    v_readlane_b32 s30, v14, 0
+; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB13_4:
-; SI-NEXT:    ; implicit-def: $vgpr13
+; SI-NEXT:    ; implicit-def: $sgpr88
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr78
+; SI-NEXT:    ; implicit-def: $sgpr39
+; SI-NEXT:    ; implicit-def: $sgpr76
+; SI-NEXT:    ; implicit-def: $sgpr38
+; SI-NEXT:    ; implicit-def: $sgpr74
+; SI-NEXT:    ; implicit-def: $sgpr37
+; SI-NEXT:    ; implicit-def: $sgpr72
+; SI-NEXT:    ; implicit-def: $sgpr36
 ; SI-NEXT:    ; implicit-def: $sgpr62
-; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; implicit-def: $sgpr61
-; SI-NEXT:    ; implicit-def: $vgpr11
+; SI-NEXT:    ; implicit-def: $sgpr35
 ; SI-NEXT:    ; implicit-def: $sgpr60
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $sgpr59
-; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $sgpr34
 ; SI-NEXT:    ; implicit-def: $sgpr58
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $sgpr57
-; SI-NEXT:    ; implicit-def: $vgpr7
+; SI-NEXT:    ; implicit-def: $sgpr31
 ; SI-NEXT:    ; implicit-def: $sgpr56
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $sgpr47
-; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $sgpr30
 ; SI-NEXT:    ; implicit-def: $sgpr46
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $sgpr45
-; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $sgpr95
 ; SI-NEXT:    ; implicit-def: $sgpr44
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $sgpr43
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $sgpr94
+; SI-NEXT:    ; implicit-def: $sgpr93
+; SI-NEXT:    ; implicit-def: $sgpr92
 ; SI-NEXT:    ; implicit-def: $sgpr42
+; SI-NEXT:    ; implicit-def: $sgpr40
 ; SI-NEXT:    s_branch .LBB13_2
 ;
 ; VI-LABEL: bitcast_v26i32_to_v52i16_scalar:
@@ -13646,211 +13666,217 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a,
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
-; SI-NEXT:    v_mov_b32_e32 v28, s16
-; SI-NEXT:    v_mov_b32_e32 v27, s17
-; SI-NEXT:    v_mov_b32_e32 v25, s18
+; SI-NEXT:    v_mov_b32_e32 v25, s16
+; SI-NEXT:    v_mov_b32_e32 v26, s17
+; SI-NEXT:    v_mov_b32_e32 v23, s18
 ; SI-NEXT:    v_mov_b32_e32 v24, s19
-; SI-NEXT:    v_mov_b32_e32 v21, s20
-; SI-NEXT:    v_mov_b32_e32 v19, s21
-; SI-NEXT:    v_mov_b32_e32 v22, s22
+; SI-NEXT:    v_mov_b32_e32 v19, s20
+; SI-NEXT:    v_mov_b32_e32 v20, s21
+; SI-NEXT:    v_mov_b32_e32 v21, s22
+; SI-NEXT:    v_mov_b32_e32 v22, s23
+; SI-NEXT:    v_mov_b32_e32 v17, s24
+; SI-NEXT:    v_mov_b32_e32 v18, s25
+; SI-NEXT:    v_mov_b32_e32 v15, s26
+; SI-NEXT:    v_mov_b32_e32 v16, s27
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_mov_b32_e32 v20, s23
-; SI-NEXT:    v_mov_b32_e32 v18, s24
-; SI-NEXT:    v_mov_b32_e32 v17, s25
-; SI-NEXT:    v_mov_b32_e32 v16, s26
-; SI-NEXT:    v_mov_b32_e32 v15, s27
-; SI-NEXT:    v_mov_b32_e32 v14, s28
-; SI-NEXT:    v_mov_b32_e32 v13, s29
-; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v13, s28
+; SI-NEXT:    v_mov_b32_e32 v14, s29
+; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB29_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_alignbit_b32 v23, v12, v11, 16
-; SI-NEXT:    v_alignbit_b32 v26, v10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v29, v8, v7, 16
-; SI-NEXT:    v_alignbit_b32 v30, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v31, v4, v3, 16
-; SI-NEXT:    v_alignbit_b32 v33, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v35, v13, v14, 16
-; SI-NEXT:    v_alignbit_b32 v37, v15, v16, 16
-; SI-NEXT:    v_alignbit_b32 v48, v17, v18, 16
-; SI-NEXT:    v_alignbit_b32 v50, v20, v22, 16
-; SI-NEXT:    v_alignbit_b32 v52, v19, v21, 16
-; SI-NEXT:    v_alignbit_b32 v54, v24, v25, 16
+; SI-NEXT:    v_lshr_b64 v[27:28], v[11:12], 16
+; SI-NEXT:    v_lshr_b64 v[28:29], v[9:10], 16
+; SI-NEXT:    v_lshr_b64 v[29:30], v[7:8], 16
+; SI-NEXT:    v_lshr_b64 v[30:31], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[31:32], v[3:4], 16
+; SI-NEXT:    v_lshr_b64 v[32:33], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[36:37], v[13:14], 16
+; SI-NEXT:    v_lshr_b64 v[33:34], v[15:16], 16
+; SI-NEXT:    v_lshr_b64 v[37:38], v[21:22], 16
+; SI-NEXT:    v_lshr_b64 v[48:49], v[23:24], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v2
+; SI-NEXT:    s_waitcnt expcnt(6)
+; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v14
+; SI-NEXT:    s_waitcnt expcnt(5)
+; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v16
 ; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    v_alignbit_b32 v40, v27, v28, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v13
-; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v15
-; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v17
+; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v18
 ; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v22
 ; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v19
+; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v20
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v24
+; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v24
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v27
+; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v26
+; SI-NEXT:    v_lshr_b64 v[34:35], v[17:18], 16
+; SI-NEXT:    v_lshr_b64 v[38:39], v[19:20], 16
+; SI-NEXT:    v_lshr_b64 v[49:50], v[25:26], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB29_3
 ; SI-NEXT:  .LBB29_2: ; %cmp.true
-; SI-NEXT:    v_add_f32_e32 v27, 1.0, v27
-; SI-NEXT:    v_add_f32_e32 v28, 1.0, v28
+; SI-NEXT:    v_add_f32_e32 v12, 1.0, v12
+; SI-NEXT:    v_add_f32_e32 v11, 1.0, v11
+; SI-NEXT:    v_add_f32_e32 v10, 1.0, v10
+; SI-NEXT:    v_add_f32_e32 v9, 1.0, v9
+; SI-NEXT:    v_lshr_b64 v[27:28], v[11:12], 16
+; SI-NEXT:    v_add_f32_e32 v8, 1.0, v8
+; SI-NEXT:    v_add_f32_e32 v7, 1.0, v7
+; SI-NEXT:    v_lshr_b64 v[28:29], v[9:10], 16
+; SI-NEXT:    v_add_f32_e32 v6, 1.0, v6
+; SI-NEXT:    v_add_f32_e32 v5, 1.0, v5
+; SI-NEXT:    v_lshr_b64 v[29:30], v[7:8], 16
+; SI-NEXT:    v_add_f32_e32 v4, 1.0, v4
+; SI-NEXT:    v_add_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_lshr_b64 v[30:31], v[5:6], 16
+; SI-NEXT:    v_add_f32_e32 v14, 1.0, v14
+; SI-NEXT:    v_add_f32_e32 v13, 1.0, v13
+; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_lshr_b64 v[31:32], v[3:4], 16
 ; SI-NEXT:    v_add_f32_e32 v24, 1.0, v24
-; SI-NEXT:    v_add_f32_e32 v25, 1.0, v25
-; SI-NEXT:    v_add_f32_e32 v19, 1.0, v19
+; SI-NEXT:    v_add_f32_e32 v23, 1.0, v23
+; SI-NEXT:    v_add_f32_e32 v22, 1.0, v22
 ; SI-NEXT:    v_add_f32_e32 v21, 1.0, v21
+; SI-NEXT:    v_add_f32_e32 v16, 1.0, v16
+; SI-NEXT:    v_add_f32_e32 v15, 1.0, v15
+; SI-NEXT:    v_lshr_b64 v[32:33], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[36:37], v[13:14], 16
+; SI-NEXT:    v_add_f32_e32 v26, 1.0, v26
+; SI-NEXT:    v_add_f32_e32 v25, 1.0, v25
 ; SI-NEXT:    v_add_f32_e32 v20, 1.0, v20
-; SI-NEXT:    v_add_f32_e32 v22, 1.0, v22
-; SI-NEXT:    v_add_f32_e32 v17, 1.0, v17
+; SI-NEXT:    v_add_f32_e32 v19, 1.0, v19
 ; SI-NEXT:    v_add_f32_e32 v18, 1.0, v18
-; SI-NEXT:    v_add_f32_e32 v15, 1.0, v15
-; SI-NEXT:    v_add_f32_e32 v16, 1.0, v16
-; SI-NEXT:    v_add_f32_e32 v13, 1.0, v13
-; SI-NEXT:    v_add_f32_e32 v14, 1.0, v14
-; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_add_f32_e32 v1, 1.0, v1
-; SI-NEXT:    v_add_f32_e32 v4, 1.0, v4
-; SI-NEXT:    v_add_f32_e32 v3, 1.0, v3
-; SI-NEXT:    v_add_f32_e32 v6, 1.0, v6
-; SI-NEXT:    v_add_f32_e32 v5, 1.0, v5
-; SI-NEXT:    v_add_f32_e32 v8, 1.0, v8
-; SI-NEXT:    v_add_f32_e32 v7, 1.0, v7
-; SI-NEXT:    v_add_f32_e32 v10, 1.0, v10
-; SI-NEXT:    v_add_f32_e32 v9, 1.0, v9
-; SI-NEXT:    v_add_f32_e32 v12, 1.0, v12
-; SI-NEXT:    v_add_f32_e32 v11, 1.0, v11
-; SI-NEXT:    v_alignbit_b32 v23, v12, v11, 16
-; SI-NEXT:    v_alignbit_b32 v26, v10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v29, v8, v7, 16
-; SI-NEXT:    v_alignbit_b32 v30, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v31, v4, v3, 16
-; SI-NEXT:    v_alignbit_b32 v33, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v35, v13, v14, 16
-; SI-NEXT:    v_alignbit_b32 v37, v15, v16, 16
-; SI-NEXT:    v_alignbit_b32 v48, v17, v18, 16
-; SI-NEXT:    v_alignbit_b32 v50, v20, v22, 16
-; SI-NEXT:    v_alignbit_b32 v52, v19, v21, 16
-; SI-NEXT:    v_alignbit_b32 v54, v24, v25, 16
+; SI-NEXT:    v_add_f32_e32 v17, 1.0, v17
+; SI-NEXT:    v_lshr_b64 v[33:34], v[15:16], 16
+; SI-NEXT:    v_lshr_b64 v[37:38], v[21:22], 16
+; SI-NEXT:    v_lshr_b64 v[48:49], v[23:24], 16
+; SI-NEXT:    v_lshr_b64 v[34:35], v[17:18], 16
+; SI-NEXT:    v_lshr_b64 v[38:39], v[19:20], 16
+; SI-NEXT:    v_lshr_b64 v[49:50], v[25:26], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v2
+; SI-NEXT:    s_waitcnt expcnt(6)
+; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v14
+; SI-NEXT:    s_waitcnt expcnt(5)
+; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v16
 ; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    v_alignbit_b32 v40, v27, v28, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v34, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v13
-; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v15
-; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v17
+; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v18
 ; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v22
 ; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v19
+; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v20
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v24
+; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v24
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v27
+; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v26
 ; SI-NEXT:  .LBB29_3: ; %end
-; SI-NEXT:    v_and_b32_e32 v28, 0xffff, v28
-; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v40
-; SI-NEXT:    v_or_b32_e32 v28, v28, v40
-; SI-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v27
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v44
-; SI-NEXT:    v_or_b32_e32 v27, v27, v28
-; SI-NEXT:    v_add_i32_e32 v28, vcc, 4, v0
-; SI-NEXT:    buffer_store_dword v27, v28, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v49
 ; SI-NEXT:    v_and_b32_e32 v25, 0xffff, v25
+; SI-NEXT:    v_or_b32_e32 v25, v25, v35
+; SI-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v54
-; SI-NEXT:    v_or_b32_e32 v25, v25, v27
-; SI-NEXT:    v_add_i32_e32 v27, vcc, 8, v0
-; SI-NEXT:    buffer_store_dword v25, v27, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v24, 0xffff, v24
+; SI-NEXT:    v_and_b32_e32 v25, 0xffff, v26
+; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v47
+; SI-NEXT:    v_or_b32_e32 v25, v25, v26
+; SI-NEXT:    v_add_i32_e32 v26, vcc, 4, v0
+; SI-NEXT:    buffer_store_dword v25, v26, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v43
-; SI-NEXT:    v_or_b32_e32 v24, v24, v25
-; SI-NEXT:    v_add_i32_e32 v25, vcc, 12, v0
-; SI-NEXT:    buffer_store_dword v24, v25, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v48
+; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v23
+; SI-NEXT:    v_or_b32_e32 v23, v23, v25
+; SI-NEXT:    v_add_i32_e32 v25, vcc, 8, v0
+; SI-NEXT:    buffer_store_dword v23, v25, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v52
-; SI-NEXT:    v_or_b32_e32 v21, v21, v24
-; SI-NEXT:    v_add_i32_e32 v24, vcc, 16, v0
-; SI-NEXT:    buffer_store_dword v21, v24, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v24
+; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v46
+; SI-NEXT:    v_or_b32_e32 v23, v23, v24
+; SI-NEXT:    v_add_i32_e32 v24, vcc, 12, v0
+; SI-NEXT:    buffer_store_dword v23, v24, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v42
-; SI-NEXT:    v_or_b32_e32 v19, v19, v21
-; SI-NEXT:    v_add_i32_e32 v21, vcc, 20, v0
-; SI-NEXT:    buffer_store_dword v19, v21, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v22
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v50
-; SI-NEXT:    v_or_b32_e32 v19, v19, v21
-; SI-NEXT:    v_add_i32_e32 v21, vcc, 24, v0
-; SI-NEXT:    buffer_store_dword v19, v21, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v38
+; SI-NEXT:    v_or_b32_e32 v19, v19, v23
+; SI-NEXT:    v_add_i32_e32 v23, vcc, 16, v0
+; SI-NEXT:    buffer_store_dword v19, v23, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v41
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v45
+; SI-NEXT:    v_or_b32_e32 v19, v19, v20
+; SI-NEXT:    v_add_i32_e32 v20, vcc, 20, v0
+; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v21
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v37
+; SI-NEXT:    v_or_b32_e32 v19, v19, v20
+; SI-NEXT:    v_add_i32_e32 v20, vcc, 24, v0
+; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v22
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v44
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v20
 ; SI-NEXT:    v_add_i32_e32 v20, vcc, 28, v0
 ; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v18
+; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v48
-; SI-NEXT:    v_or_b32_e32 v18, v18, v19
+; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v34
+; SI-NEXT:    v_or_b32_e32 v17, v17, v19
 ; SI-NEXT:    v_add_i32_e32 v19, vcc, 32, v0
-; SI-NEXT:    buffer_store_dword v18, v19, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; SI-NEXT:    buffer_store_dword v17, v19, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v55
+; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v18
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v43
 ; SI-NEXT:    v_or_b32_e32 v17, v17, v18
 ; SI-NEXT:    v_add_i32_e32 v18, vcc, 36, v0
 ; SI-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v37
-; SI-NEXT:    v_or_b32_e32 v16, v16, v17
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v33
+; SI-NEXT:    v_or_b32_e32 v15, v15, v17
 ; SI-NEXT:    v_add_i32_e32 v17, vcc, 40, v0
-; SI-NEXT:    buffer_store_dword v16, v17, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; SI-NEXT:    buffer_store_dword v15, v17, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v53
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v16
+; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v42
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v16
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 44, v0
 ; SI-NEXT:    buffer_store_dword v15, v16, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v14
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v13
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v35
-; SI-NEXT:    v_or_b32_e32 v14, v14, v15
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v36
+; SI-NEXT:    v_or_b32_e32 v13, v13, v15
 ; SI-NEXT:    v_add_i32_e32 v15, vcc, 48, v0
-; SI-NEXT:    buffer_store_dword v14, v15, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v13
+; SI-NEXT:    buffer_store_dword v13, v15, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v51
+; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v41
 ; SI-NEXT:    v_or_b32_e32 v13, v13, v14
 ; SI-NEXT:    v_add_i32_e32 v14, vcc, 52, v0
 ; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v33
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v32
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v13
 ; SI-NEXT:    v_add_i32_e32 v13, vcc, 56, v0
 ; SI-NEXT:    buffer_store_dword v1, v13, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v49
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v40
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 60, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -13862,7 +13888,7 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v39
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v55
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x44, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -13874,7 +13900,7 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v38
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v54
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x4c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -13886,68 +13912,71 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v36
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v53
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x54, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v26
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v28
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x58, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v10
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v34
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v52
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x5c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v23
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v27
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x60, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v32
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v51
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x64, v0
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB29_4:
-; SI-NEXT:    ; implicit-def: $vgpr40
+; SI-NEXT:    ; implicit-def: $vgpr49
+; SI-NEXT:    ; implicit-def: $vgpr47
+; SI-NEXT:    ; implicit-def: $vgpr48
+; SI-NEXT:    ; implicit-def: $vgpr46
+; SI-NEXT:    ; implicit-def: $vgpr38
+; SI-NEXT:    ; implicit-def: $vgpr45
+; SI-NEXT:    ; implicit-def: $vgpr37
 ; SI-NEXT:    ; implicit-def: $vgpr44
-; SI-NEXT:    ; implicit-def: $vgpr54
+; SI-NEXT:    ; implicit-def: $vgpr34
 ; SI-NEXT:    ; implicit-def: $vgpr43
-; SI-NEXT:    ; implicit-def: $vgpr52
+; SI-NEXT:    ; implicit-def: $vgpr33
 ; SI-NEXT:    ; implicit-def: $vgpr42
-; SI-NEXT:    ; implicit-def: $vgpr50
 ; SI-NEXT:    ; implicit-def: $vgpr41
-; SI-NEXT:    ; implicit-def: $vgpr48
+; SI-NEXT:    ; implicit-def: $vgpr40
 ; SI-NEXT:    ; implicit-def: $vgpr55
-; SI-NEXT:    ; implicit-def: $vgpr37
+; SI-NEXT:    ; implicit-def: $vgpr54
 ; SI-NEXT:    ; implicit-def: $vgpr53
-; SI-NEXT:    ; implicit-def: $vgpr35
+; SI-NEXT:    ; implicit-def: $vgpr52
 ; SI-NEXT:    ; implicit-def: $vgpr51
-; SI-NEXT:    ; implicit-def: $vgpr33
-; SI-NEXT:    ; implicit-def: $vgpr49
+; SI-NEXT:    ; implicit-def: $vgpr36
+; SI-NEXT:    ; implicit-def: $vgpr32
 ; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr39
 ; SI-NEXT:    ; implicit-def: $vgpr30
-; SI-NEXT:    ; implicit-def: $vgpr38
 ; SI-NEXT:    ; implicit-def: $vgpr29
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr26
-; SI-NEXT:    ; implicit-def: $vgpr34
-; SI-NEXT:    ; implicit-def: $vgpr23
-; SI-NEXT:    ; implicit-def: $vgpr32
+; SI-NEXT:    ; implicit-def: $vgpr28
+; SI-NEXT:    ; implicit-def: $vgpr27
 ; SI-NEXT:    s_branch .LBB29_2
 ;
 ; VI-LABEL: bitcast_v26f32_to_v52i16_scalar:
@@ -23073,313 +23102,333 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3
 ; SI-LABEL: bitcast_v13i64_to_v52i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_writelane_b32 v14, s30, 0
+; SI-NEXT:    v_writelane_b32 v14, s31, 1
+; SI-NEXT:    v_writelane_b32 v14, s34, 2
+; SI-NEXT:    v_writelane_b32 v14, s35, 3
+; SI-NEXT:    v_writelane_b32 v14, s36, 4
+; SI-NEXT:    v_writelane_b32 v14, s37, 5
+; SI-NEXT:    v_writelane_b32 v14, s38, 6
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
-; SI-NEXT:    v_readfirstlane_b32 s41, v1
-; SI-NEXT:    v_readfirstlane_b32 s40, v2
-; SI-NEXT:    v_readfirstlane_b32 s15, v3
-; SI-NEXT:    v_readfirstlane_b32 s14, v4
-; SI-NEXT:    v_readfirstlane_b32 s13, v5
-; SI-NEXT:    v_readfirstlane_b32 s12, v6
-; SI-NEXT:    v_readfirstlane_b32 s11, v7
-; SI-NEXT:    v_readfirstlane_b32 s10, v8
-; SI-NEXT:    v_readfirstlane_b32 s9, v9
-; SI-NEXT:    v_readfirstlane_b32 s8, v10
-; SI-NEXT:    v_readfirstlane_b32 s7, v11
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_readfirstlane_b32 s6, v12
+; SI-NEXT:    v_writelane_b32 v14, s39, 7
+; SI-NEXT:    v_readfirstlane_b32 s14, v1
+; SI-NEXT:    v_readfirstlane_b32 s15, v2
+; SI-NEXT:    v_readfirstlane_b32 s12, v3
+; SI-NEXT:    v_readfirstlane_b32 s13, v4
+; SI-NEXT:    v_readfirstlane_b32 s10, v5
+; SI-NEXT:    v_readfirstlane_b32 s11, v6
+; SI-NEXT:    v_readfirstlane_b32 s8, v7
+; SI-NEXT:    v_readfirstlane_b32 s9, v8
+; SI-NEXT:    v_readfirstlane_b32 s6, v9
+; SI-NEXT:    v_readfirstlane_b32 s7, v10
+; SI-NEXT:    v_readfirstlane_b32 s4, v11
+; SI-NEXT:    s_and_b64 s[40:41], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s5, v12
+; SI-NEXT:    v_writelane_b32 v14, s48, 8
 ; SI-NEXT:    s_cbranch_scc0 .LBB41_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s9
-; SI-NEXT:    v_mov_b32_e32 v3, s11
-; SI-NEXT:    v_mov_b32_e32 v4, s13
-; SI-NEXT:    v_mov_b32_e32 v5, s15
-; SI-NEXT:    v_mov_b32_e32 v6, s41
-; SI-NEXT:    v_mov_b32_e32 v7, s28
-; SI-NEXT:    v_mov_b32_e32 v8, s26
-; SI-NEXT:    v_mov_b32_e32 v9, s24
-; SI-NEXT:    v_mov_b32_e32 v10, s22
-; SI-NEXT:    v_mov_b32_e32 v11, s20
-; SI-NEXT:    v_mov_b32_e32 v12, s18
-; SI-NEXT:    v_mov_b32_e32 v13, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v2, s8, v2, 16
-; SI-NEXT:    v_alignbit_b32 v3, s10, v3, 16
-; SI-NEXT:    v_alignbit_b32 v4, s12, v4, 16
-; SI-NEXT:    v_alignbit_b32 v5, s14, v5, 16
-; SI-NEXT:    v_alignbit_b32 v6, s40, v6, 16
-; SI-NEXT:    v_alignbit_b32 v7, s29, v7, 16
-; SI-NEXT:    v_alignbit_b32 v8, s27, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, s25, v9, 16
-; SI-NEXT:    v_alignbit_b32 v10, s23, v10, 16
-; SI-NEXT:    v_alignbit_b32 v11, s21, v11, 16
-; SI-NEXT:    v_alignbit_b32 v12, s19, v12, 16
-; SI-NEXT:    v_alignbit_b32 v13, s17, v13, 16
-; SI-NEXT:    s_lshr_b32 s42, s6, 16
-; SI-NEXT:    s_lshr_b32 s43, s8, 16
-; SI-NEXT:    s_lshr_b32 s44, s10, 16
-; SI-NEXT:    s_lshr_b32 s45, s12, 16
-; SI-NEXT:    s_lshr_b32 s46, s14, 16
-; SI-NEXT:    s_lshr_b32 s47, s40, 16
-; SI-NEXT:    s_lshr_b32 s56, s29, 16
-; SI-NEXT:    s_lshr_b32 s57, s27, 16
-; SI-NEXT:    s_lshr_b32 s58, s25, 16
-; SI-NEXT:    s_lshr_b32 s59, s23, 16
-; SI-NEXT:    s_lshr_b32 s60, s21, 16
-; SI-NEXT:    s_lshr_b32 s61, s19, 16
-; SI-NEXT:    s_lshr_b32 s62, s17, 16
+; SI-NEXT:    s_lshr_b32 s92, s5, 16
+; SI-NEXT:    s_lshr_b32 s93, s7, 16
+; SI-NEXT:    s_lshr_b32 s94, s9, 16
+; SI-NEXT:    s_lshr_b32 s95, s11, 16
+; SI-NEXT:    s_lshr_b32 s30, s13, 16
+; SI-NEXT:    s_lshr_b32 s31, s15, 16
+; SI-NEXT:    s_lshr_b32 s34, s29, 16
+; SI-NEXT:    s_lshr_b32 s35, s27, 16
+; SI-NEXT:    s_lshr_b32 s36, s25, 16
+; SI-NEXT:    s_lshr_b32 s37, s23, 16
+; SI-NEXT:    s_lshr_b32 s38, s21, 16
+; SI-NEXT:    s_lshr_b32 s39, s19, 16
+; SI-NEXT:    s_lshr_b32 s48, s17, 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[42:43], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[10:11], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[12:13], 16
+; SI-NEXT:    s_lshr_b64 s[58:59], s[14:15], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[62:63], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[72:73], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[74:75], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[76:77], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[78:79], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[88:89], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB41_3
 ; SI-NEXT:  .LBB41_2: ; %cmp.true
-; SI-NEXT:    s_add_u32 s16, s16, 3
-; SI-NEXT:    s_addc_u32 s17, s17, 0
-; SI-NEXT:    s_add_u32 s18, s18, 3
-; SI-NEXT:    s_addc_u32 s19, s19, 0
-; SI-NEXT:    s_add_u32 s20, s20, 3
-; SI-NEXT:    s_addc_u32 s21, s21, 0
-; SI-NEXT:    s_add_u32 s22, s22, 3
-; SI-NEXT:    s_addc_u32 s23, s23, 0
-; SI-NEXT:    s_add_u32 s24, s24, 3
-; SI-NEXT:    s_addc_u32 s25, s25, 0
-; SI-NEXT:    s_add_u32 s26, s26, 3
-; SI-NEXT:    s_addc_u32 s27, s27, 0
+; SI-NEXT:    s_add_u32 s4, s4, 3
+; SI-NEXT:    s_addc_u32 s5, s5, 0
+; SI-NEXT:    s_add_u32 s6, s6, 3
+; SI-NEXT:    s_addc_u32 s7, s7, 0
+; SI-NEXT:    s_add_u32 s8, s8, 3
+; SI-NEXT:    s_addc_u32 s9, s9, 0
+; SI-NEXT:    s_add_u32 s10, s10, 3
+; SI-NEXT:    s_addc_u32 s11, s11, 0
+; SI-NEXT:    s_add_u32 s12, s12, 3
+; SI-NEXT:    s_addc_u32 s13, s13, 0
+; SI-NEXT:    s_add_u32 s14, s14, 3
+; SI-NEXT:    s_addc_u32 s15, s15, 0
 ; SI-NEXT:    s_add_u32 s28, s28, 3
 ; SI-NEXT:    s_addc_u32 s29, s29, 0
-; SI-NEXT:    s_add_u32 s41, s41, 3
-; SI-NEXT:    s_addc_u32 s40, s40, 0
-; SI-NEXT:    s_add_u32 s15, s15, 3
-; SI-NEXT:    s_addc_u32 s14, s14, 0
-; SI-NEXT:    s_add_u32 s13, s13, 3
-; SI-NEXT:    s_addc_u32 s12, s12, 0
-; SI-NEXT:    s_add_u32 s11, s11, 3
-; SI-NEXT:    s_addc_u32 s10, s10, 0
-; SI-NEXT:    s_add_u32 s9, s9, 3
-; SI-NEXT:    s_addc_u32 s8, s8, 0
-; SI-NEXT:    s_add_u32 s7, s7, 3
-; SI-NEXT:    s_addc_u32 s6, s6, 0
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s9
-; SI-NEXT:    v_mov_b32_e32 v3, s11
-; SI-NEXT:    v_mov_b32_e32 v4, s13
-; SI-NEXT:    v_mov_b32_e32 v5, s15
-; SI-NEXT:    v_mov_b32_e32 v6, s41
-; SI-NEXT:    v_mov_b32_e32 v7, s28
-; SI-NEXT:    v_mov_b32_e32 v8, s26
-; SI-NEXT:    v_mov_b32_e32 v9, s24
-; SI-NEXT:    v_mov_b32_e32 v10, s22
-; SI-NEXT:    v_mov_b32_e32 v11, s20
-; SI-NEXT:    v_mov_b32_e32 v12, s18
-; SI-NEXT:    v_mov_b32_e32 v13, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v2, s8, v2, 16
-; SI-NEXT:    v_alignbit_b32 v3, s10, v3, 16
-; SI-NEXT:    v_alignbit_b32 v4, s12, v4, 16
-; SI-NEXT:    v_alignbit_b32 v5, s14, v5, 16
-; SI-NEXT:    v_alignbit_b32 v6, s40, v6, 16
-; SI-NEXT:    v_alignbit_b32 v7, s29, v7, 16
-; SI-NEXT:    v_alignbit_b32 v8, s27, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, s25, v9, 16
-; SI-NEXT:    v_alignbit_b32 v10, s23, v10, 16
-; SI-NEXT:    v_alignbit_b32 v11, s21, v11, 16
-; SI-NEXT:    v_alignbit_b32 v12, s19, v12, 16
-; SI-NEXT:    v_alignbit_b32 v13, s17, v13, 16
-; SI-NEXT:    s_lshr_b32 s42, s6, 16
-; SI-NEXT:    s_lshr_b32 s43, s8, 16
-; SI-NEXT:    s_lshr_b32 s44, s10, 16
-; SI-NEXT:    s_lshr_b32 s45, s12, 16
-; SI-NEXT:    s_lshr_b32 s46, s14, 16
-; SI-NEXT:    s_lshr_b32 s47, s40, 16
-; SI-NEXT:    s_lshr_b32 s56, s29, 16
-; SI-NEXT:    s_lshr_b32 s57, s27, 16
-; SI-NEXT:    s_lshr_b32 s58, s25, 16
-; SI-NEXT:    s_lshr_b32 s59, s23, 16
-; SI-NEXT:    s_lshr_b32 s60, s21, 16
-; SI-NEXT:    s_lshr_b32 s61, s19, 16
-; SI-NEXT:    s_lshr_b32 s62, s17, 16
+; SI-NEXT:    s_add_u32 s26, s26, 3
+; SI-NEXT:    s_addc_u32 s27, s27, 0
+; SI-NEXT:    s_add_u32 s24, s24, 3
+; SI-NEXT:    s_addc_u32 s25, s25, 0
+; SI-NEXT:    s_add_u32 s22, s22, 3
+; SI-NEXT:    s_addc_u32 s23, s23, 0
+; SI-NEXT:    s_add_u32 s20, s20, 3
+; SI-NEXT:    s_addc_u32 s21, s21, 0
+; SI-NEXT:    s_add_u32 s18, s18, 3
+; SI-NEXT:    s_addc_u32 s19, s19, 0
+; SI-NEXT:    s_add_u32 s16, s16, 3
+; SI-NEXT:    s_addc_u32 s17, s17, 0
+; SI-NEXT:    s_lshr_b32 s92, s5, 16
+; SI-NEXT:    s_lshr_b32 s93, s7, 16
+; SI-NEXT:    s_lshr_b32 s94, s9, 16
+; SI-NEXT:    s_lshr_b32 s95, s11, 16
+; SI-NEXT:    s_lshr_b32 s30, s13, 16
+; SI-NEXT:    s_lshr_b32 s31, s15, 16
+; SI-NEXT:    s_lshr_b32 s34, s29, 16
+; SI-NEXT:    s_lshr_b32 s35, s27, 16
+; SI-NEXT:    s_lshr_b32 s36, s25, 16
+; SI-NEXT:    s_lshr_b32 s37, s23, 16
+; SI-NEXT:    s_lshr_b32 s38, s21, 16
+; SI-NEXT:    s_lshr_b32 s39, s19, 16
+; SI-NEXT:    s_lshr_b32 s48, s17, 16
+; SI-NEXT:    s_lshr_b64 s[40:41], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[42:43], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[10:11], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[12:13], 16
+; SI-NEXT:    s_lshr_b64 s[58:59], s[14:15], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[62:63], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[72:73], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[74:75], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[76:77], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[78:79], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[88:89], s[16:17], 16
 ; SI-NEXT:  .LBB41_3: ; %end
-; SI-NEXT:    s_and_b32 s4, s16, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT:    v_or_b32_e32 v13, s4, v13
-; SI-NEXT:    s_and_b32 s4, s17, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s62, 16
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_mov_b32_e32 v14, s4
-; SI-NEXT:    s_and_b32 s4, s18, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; SI-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v13, vcc, 4, v0
-; SI-NEXT:    v_or_b32_e32 v12, s4, v12
-; SI-NEXT:    s_and_b32 s4, s19, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s61, 16
-; SI-NEXT:    buffer_store_dword v14, v13, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v13, vcc, 8, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v12, v13, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v13, s4
-; SI-NEXT:    s_and_b32 s4, s20, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT:    s_lshl_b32 s41, s88, 16
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s41
+; SI-NEXT:    v_mov_b32_e32 v1, s16
+; SI-NEXT:    s_and_b32 s16, s17, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s48, 16
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_lshl_b32 s16, s78, 16
+; SI-NEXT:    s_and_b32 s17, s18, 0xffff
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    v_mov_b32_e32 v3, s16
+; SI-NEXT:    s_and_b32 s16, s19, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s39, 16
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v4, s16
+; SI-NEXT:    s_lshl_b32 s16, s76, 16
+; SI-NEXT:    s_and_b32 s17, s20, 0xffff
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v12, vcc, 12, v0
-; SI-NEXT:    v_or_b32_e32 v11, s4, v11
-; SI-NEXT:    s_and_b32 s4, s21, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s60, 16
-; SI-NEXT:    buffer_store_dword v13, v12, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v12, vcc, 16, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v11, v12, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v12, s4
-; SI-NEXT:    s_and_b32 s4, s22, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
+; SI-NEXT:    v_mov_b32_e32 v5, s16
+; SI-NEXT:    s_and_b32 s16, s21, 0xffff
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
+; SI-NEXT:    s_lshl_b32 s17, s38, 16
+; SI-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 12, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 16, v0
+; SI-NEXT:    s_waitcnt expcnt(2)
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s22, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s74, 16
+; SI-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 20, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 20, v0
-; SI-NEXT:    v_or_b32_e32 v10, s4, v10
-; SI-NEXT:    s_and_b32 s4, s23, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s59, 16
-; SI-NEXT:    buffer_store_dword v12, v11, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 24, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v10, v11, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v11, s4
-; SI-NEXT:    s_and_b32 s4, s24, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s23, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s37, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 24, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 28, v0
-; SI-NEXT:    v_or_b32_e32 v9, s4, v9
-; SI-NEXT:    s_and_b32 s4, s25, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s58, 16
-; SI-NEXT:    buffer_store_dword v11, v10, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 32, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v9, v10, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v10, s4
-; SI-NEXT:    s_and_b32 s4, s26, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s24, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s72, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 28, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 36, v0
-; SI-NEXT:    v_or_b32_e32 v8, s4, v8
-; SI-NEXT:    s_and_b32 s4, s27, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s57, 16
-; SI-NEXT:    buffer_store_dword v10, v9, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 40, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v8, v9, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v9, s4
-; SI-NEXT:    s_and_b32 s4, s28, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s25, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s36, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 32, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 44, v0
-; SI-NEXT:    v_or_b32_e32 v7, s4, v7
-; SI-NEXT:    s_and_b32 s4, s29, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s56, 16
-; SI-NEXT:    buffer_store_dword v9, v8, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v8, s4
-; SI-NEXT:    s_and_b32 s4, s41, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s26, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s62, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 36, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 52, v0
-; SI-NEXT:    v_or_b32_e32 v6, s4, v6
-; SI-NEXT:    s_and_b32 s4, s40, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s47, 16
-; SI-NEXT:    buffer_store_dword v8, v7, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 56, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v7, s4
-; SI-NEXT:    s_and_b32 s4, s15, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s27, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s35, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 40, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 60, v0
-; SI-NEXT:    v_or_b32_e32 v5, s4, v5
-; SI-NEXT:    s_and_b32 s4, s14, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s46, 16
-; SI-NEXT:    buffer_store_dword v7, v6, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 64, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v6, s4
-; SI-NEXT:    s_and_b32 s4, s13, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s28, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s60, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 44, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x44, v0
-; SI-NEXT:    v_or_b32_e32 v4, s4, v4
-; SI-NEXT:    s_and_b32 s4, s12, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s45, 16
-; SI-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v5, s4
-; SI-NEXT:    s_and_b32 s4, s11, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s29, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s34, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 48, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x4c, v0
-; SI-NEXT:    v_or_b32_e32 v3, s4, v3
-; SI-NEXT:    s_and_b32 s4, s10, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s44, 16
-; SI-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x50, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v4, s4
-; SI-NEXT:    s_and_b32 s4, s9, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s14, s14, 0xffff
+; SI-NEXT:    s_lshl_b32 s16, s58, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 52, v0
+; SI-NEXT:    s_or_b32 s14, s14, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x54, v0
-; SI-NEXT:    v_or_b32_e32 v2, s4, v2
-; SI-NEXT:    s_and_b32 s4, s8, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s43, 16
-; SI-NEXT:    buffer_store_dword v4, v3, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x58, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v3, s4
-; SI-NEXT:    s_and_b32 s4, s7, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_mov_b32_e32 v2, s14
+; SI-NEXT:    s_and_b32 s14, s15, 0xffff
+; SI-NEXT:    s_lshl_b32 s15, s31, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 56, v0
+; SI-NEXT:    s_or_b32 s14, s14, s15
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x5c, v0
-; SI-NEXT:    v_or_b32_e32 v1, s4, v1
-; SI-NEXT:    s_and_b32 s4, s6, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s42, 16
-; SI-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x60, v0
+; SI-NEXT:    v_mov_b32_e32 v2, s14
+; SI-NEXT:    s_and_b32 s12, s12, 0xffff
+; SI-NEXT:    s_lshl_b32 s14, s56, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 60, v0
+; SI-NEXT:    s_or_b32 s12, s12, s14
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s12
+; SI-NEXT:    s_and_b32 s12, s13, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s30, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 64, v0
+; SI-NEXT:    s_or_b32 s12, s12, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s12
+; SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; SI-NEXT:    s_lshl_b32 s12, s46, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x44, v0
+; SI-NEXT:    s_or_b32 s10, s10, s12
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s10, s11, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s95, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x48, v0
+; SI-NEXT:    s_or_b32 s10, s10, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; SI-NEXT:    s_lshl_b32 s10, s44, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x4c, v0
+; SI-NEXT:    s_or_b32 s8, s8, s10
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s8, s9, 0xffff
+; SI-NEXT:    s_lshl_b32 s9, s94, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x50, v0
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_lshl_b32 s8, s42, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x54, v0
+; SI-NEXT:    s_or_b32 s6, s6, s8
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s6, s7, 0xffff
+; SI-NEXT:    s_lshl_b32 s7, s93, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x58, v0
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; SI-NEXT:    s_lshl_b32 s6, s40, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x5c, v0
+; SI-NEXT:    s_or_b32 s4, s4, s6
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s4
+; SI-NEXT:    s_and_b32 s4, s5, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s92, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x60, v0
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x64, v0
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT:    v_readlane_b32 s48, v14, 8
+; SI-NEXT:    v_readlane_b32 s39, v14, 7
+; SI-NEXT:    v_readlane_b32 s38, v14, 6
+; SI-NEXT:    v_readlane_b32 s37, v14, 5
+; SI-NEXT:    v_readlane_b32 s36, v14, 4
+; SI-NEXT:    v_readlane_b32 s35, v14, 3
+; SI-NEXT:    v_readlane_b32 s34, v14, 2
+; SI-NEXT:    v_readlane_b32 s31, v14, 1
+; SI-NEXT:    v_readlane_b32 s30, v14, 0
+; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB41_4:
-; SI-NEXT:    ; implicit-def: $vgpr13
+; SI-NEXT:    ; implicit-def: $sgpr88
+; SI-NEXT:    ; implicit-def: $sgpr48
+; SI-NEXT:    ; implicit-def: $sgpr78
+; SI-NEXT:    ; implicit-def: $sgpr39
+; SI-NEXT:    ; implicit-def: $sgpr76
+; SI-NEXT:    ; implicit-def: $sgpr38
+; SI-NEXT:    ; implicit-def: $sgpr74
+; SI-NEXT:    ; implicit-def: $sgpr37
+; SI-NEXT:    ; implicit-def: $sgpr72
+; SI-NEXT:    ; implicit-def: $sgpr36
 ; SI-NEXT:    ; implicit-def: $sgpr62
-; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; implicit-def: $sgpr61
-; SI-NEXT:    ; implicit-def: $vgpr11
+; SI-NEXT:    ; implicit-def: $sgpr35
 ; SI-NEXT:    ; implicit-def: $sgpr60
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $sgpr59
-; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $sgpr34
 ; SI-NEXT:    ; implicit-def: $sgpr58
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $sgpr57
-; SI-NEXT:    ; implicit-def: $vgpr7
+; SI-NEXT:    ; implicit-def: $sgpr31
 ; SI-NEXT:    ; implicit-def: $sgpr56
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $sgpr47
-; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $sgpr30
 ; SI-NEXT:    ; implicit-def: $sgpr46
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $sgpr45
-; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $sgpr95
 ; SI-NEXT:    ; implicit-def: $sgpr44
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $sgpr43
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $sgpr94
+; SI-NEXT:    ; implicit-def: $sgpr93
+; SI-NEXT:    ; implicit-def: $sgpr92
 ; SI-NEXT:    ; implicit-def: $sgpr42
+; SI-NEXT:    ; implicit-def: $sgpr40
 ; SI-NEXT:    s_branch .LBB41_2
 ;
 ; VI-LABEL: bitcast_v13i64_to_v52i16_scalar:
@@ -31660,171 +31709,177 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a,
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    v_mov_b32_e32 v13, s28
 ; SI-NEXT:    v_mov_b32_e32 v14, s29
-; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB49_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_alignbit_b32 v27, v12, v11, 16
-; SI-NEXT:    v_alignbit_b32 v28, v10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v29, v8, v7, 16
-; SI-NEXT:    v_alignbit_b32 v30, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v31, v4, v3, 16
-; SI-NEXT:    v_alignbit_b32 v32, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v34, v14, v13, 16
-; SI-NEXT:    v_alignbit_b32 v37, v16, v15, 16
-; SI-NEXT:    v_alignbit_b32 v39, v18, v17, 16
-; SI-NEXT:    v_alignbit_b32 v49, v22, v21, 16
-; SI-NEXT:    v_alignbit_b32 v52, v20, v19, 16
-; SI-NEXT:    v_alignbit_b32 v54, v24, v23, 16
+; SI-NEXT:    v_lshr_b64 v[27:28], v[11:12], 16
+; SI-NEXT:    v_lshr_b64 v[28:29], v[9:10], 16
+; SI-NEXT:    v_lshr_b64 v[29:30], v[7:8], 16
+; SI-NEXT:    v_lshr_b64 v[30:31], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[31:32], v[3:4], 16
+; SI-NEXT:    v_lshr_b64 v[32:33], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[36:37], v[13:14], 16
+; SI-NEXT:    v_lshr_b64 v[33:34], v[15:16], 16
+; SI-NEXT:    v_lshr_b64 v[37:38], v[21:22], 16
+; SI-NEXT:    v_lshr_b64 v[48:49], v[23:24], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v2
+; SI-NEXT:    s_waitcnt expcnt(6)
+; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v14
+; SI-NEXT:    s_waitcnt expcnt(5)
+; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v16
 ; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    v_alignbit_b32 v40, v26, v25, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v18
 ; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v22
 ; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v20
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v24
+; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v24
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v26
+; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v26
+; SI-NEXT:    v_lshr_b64 v[34:35], v[17:18], 16
+; SI-NEXT:    v_lshr_b64 v[38:39], v[19:20], 16
+; SI-NEXT:    v_lshr_b64 v[49:50], v[25:26], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB49_3
 ; SI-NEXT:  .LBB49_2: ; %cmp.true
-; SI-NEXT:    v_add_f64 v[25:26], v[25:26], 1.0
-; SI-NEXT:    v_add_f64 v[23:24], v[23:24], 1.0
-; SI-NEXT:    v_add_f64 v[19:20], v[19:20], 1.0
-; SI-NEXT:    v_add_f64 v[21:22], v[21:22], 1.0
-; SI-NEXT:    v_add_f64 v[17:18], v[17:18], 1.0
-; SI-NEXT:    v_add_f64 v[15:16], v[15:16], 1.0
-; SI-NEXT:    v_add_f64 v[13:14], v[13:14], 1.0
-; SI-NEXT:    v_add_f64 v[1:2], v[1:2], 1.0
-; SI-NEXT:    v_add_f64 v[3:4], v[3:4], 1.0
-; SI-NEXT:    v_add_f64 v[5:6], v[5:6], 1.0
-; SI-NEXT:    v_add_f64 v[7:8], v[7:8], 1.0
 ; SI-NEXT:    v_add_f64 v[11:12], v[11:12], 1.0
 ; SI-NEXT:    v_add_f64 v[9:10], v[9:10], 1.0
-; SI-NEXT:    v_alignbit_b32 v27, v12, v11, 16
-; SI-NEXT:    v_alignbit_b32 v28, v10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v29, v8, v7, 16
-; SI-NEXT:    v_alignbit_b32 v30, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v31, v4, v3, 16
-; SI-NEXT:    v_alignbit_b32 v32, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v34, v14, v13, 16
-; SI-NEXT:    v_alignbit_b32 v37, v16, v15, 16
-; SI-NEXT:    v_alignbit_b32 v39, v18, v17, 16
-; SI-NEXT:    v_alignbit_b32 v49, v22, v21, 16
-; SI-NEXT:    v_alignbit_b32 v52, v20, v19, 16
-; SI-NEXT:    v_alignbit_b32 v54, v24, v23, 16
+; SI-NEXT:    v_add_f64 v[7:8], v[7:8], 1.0
+; SI-NEXT:    v_lshr_b64 v[27:28], v[11:12], 16
+; SI-NEXT:    v_add_f64 v[5:6], v[5:6], 1.0
+; SI-NEXT:    v_lshr_b64 v[28:29], v[9:10], 16
+; SI-NEXT:    v_add_f64 v[3:4], v[3:4], 1.0
+; SI-NEXT:    v_lshr_b64 v[29:30], v[7:8], 16
+; SI-NEXT:    v_add_f64 v[1:2], v[1:2], 1.0
+; SI-NEXT:    v_add_f64 v[13:14], v[13:14], 1.0
+; SI-NEXT:    v_lshr_b64 v[30:31], v[5:6], 16
+; SI-NEXT:    v_add_f64 v[15:16], v[15:16], 1.0
+; SI-NEXT:    v_add_f64 v[21:22], v[21:22], 1.0
+; SI-NEXT:    v_add_f64 v[23:24], v[23:24], 1.0
+; SI-NEXT:    v_lshr_b64 v[31:32], v[3:4], 16
+; SI-NEXT:    v_add_f64 v[17:18], v[17:18], 1.0
+; SI-NEXT:    v_add_f64 v[19:20], v[19:20], 1.0
+; SI-NEXT:    v_add_f64 v[25:26], v[25:26], 1.0
+; SI-NEXT:    v_lshr_b64 v[32:33], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[36:37], v[13:14], 16
+; SI-NEXT:    v_lshr_b64 v[33:34], v[15:16], 16
+; SI-NEXT:    v_lshr_b64 v[37:38], v[21:22], 16
+; SI-NEXT:    v_lshr_b64 v[48:49], v[23:24], 16
+; SI-NEXT:    v_lshr_b64 v[34:35], v[17:18], 16
+; SI-NEXT:    v_lshr_b64 v[38:39], v[19:20], 16
+; SI-NEXT:    v_lshr_b64 v[49:50], v[25:26], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v2
+; SI-NEXT:    s_waitcnt expcnt(6)
+; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v14
+; SI-NEXT:    s_waitcnt expcnt(5)
+; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v16
 ; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    v_alignbit_b32 v40, v26, v25, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v18
+; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v18
 ; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v22
 ; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v20
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v24
+; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v24
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v26
+; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v26
 ; SI-NEXT:  .LBB49_3: ; %end
+; SI-NEXT:    v_lshlrev_b32_e32 v35, 16, v49
 ; SI-NEXT:    v_and_b32_e32 v25, 0xffff, v25
-; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v40
-; SI-NEXT:    v_or_b32_e32 v25, v25, v40
+; SI-NEXT:    v_or_b32_e32 v25, v25, v35
 ; SI-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v25, 0xffff, v26
-; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v44
+; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v47
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v26
 ; SI-NEXT:    v_add_i32_e32 v26, vcc, 4, v0
 ; SI-NEXT:    buffer_store_dword v25, v26, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v23
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v54
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v48
+; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v23
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v25
 ; SI-NEXT:    v_add_i32_e32 v25, vcc, 8, v0
 ; SI-NEXT:    buffer_store_dword v23, v25, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v43
+; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v46
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v24
 ; SI-NEXT:    v_add_i32_e32 v24, vcc, 12, v0
 ; SI-NEXT:    buffer_store_dword v23, v24, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v52
+; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v38
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v23
 ; SI-NEXT:    v_add_i32_e32 v23, vcc, 16, v0
 ; SI-NEXT:    buffer_store_dword v19, v23, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v42
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v45
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v20
 ; SI-NEXT:    v_add_i32_e32 v20, vcc, 20, v0
 ; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v21
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v49
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v37
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v20
 ; SI-NEXT:    v_add_i32_e32 v20, vcc, 24, v0
 ; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v22
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v41
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v44
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v20
 ; SI-NEXT:    v_add_i32_e32 v20, vcc, 28, v0
 ; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v39
+; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v34
 ; SI-NEXT:    v_or_b32_e32 v17, v17, v19
 ; SI-NEXT:    v_add_i32_e32 v19, vcc, 32, v0
 ; SI-NEXT:    buffer_store_dword v17, v19, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v18
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v55
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v43
 ; SI-NEXT:    v_or_b32_e32 v17, v17, v18
 ; SI-NEXT:    v_add_i32_e32 v18, vcc, 36, v0
 ; SI-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v37
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v33
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v17
 ; SI-NEXT:    v_add_i32_e32 v17, vcc, 40, v0
 ; SI-NEXT:    buffer_store_dword v15, v17, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v16
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v53
+; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v42
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v16
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 44, v0
 ; SI-NEXT:    buffer_store_dword v15, v16, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v13
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v34
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v36
 ; SI-NEXT:    v_or_b32_e32 v13, v13, v15
 ; SI-NEXT:    v_add_i32_e32 v15, vcc, 48, v0
 ; SI-NEXT:    buffer_store_dword v13, v15, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v14
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v51
+; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v41
 ; SI-NEXT:    v_or_b32_e32 v13, v13, v14
 ; SI-NEXT:    v_add_i32_e32 v14, vcc, 52, v0
 ; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
@@ -31836,7 +31891,7 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v13, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v50
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v40
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 60, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -31848,7 +31903,7 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v48
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v55
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x44, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -31860,7 +31915,7 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v38
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v54
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x4c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -31872,7 +31927,7 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v36
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v53
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x54, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -31884,7 +31939,7 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v10
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v35
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v52
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x5c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -31896,44 +31951,47 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v33
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v51
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x64, v0
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB49_4:
-; SI-NEXT:    ; implicit-def: $vgpr40
+; SI-NEXT:    ; implicit-def: $vgpr49
+; SI-NEXT:    ; implicit-def: $vgpr47
+; SI-NEXT:    ; implicit-def: $vgpr48
+; SI-NEXT:    ; implicit-def: $vgpr46
+; SI-NEXT:    ; implicit-def: $vgpr38
+; SI-NEXT:    ; implicit-def: $vgpr45
+; SI-NEXT:    ; implicit-def: $vgpr37
 ; SI-NEXT:    ; implicit-def: $vgpr44
-; SI-NEXT:    ; implicit-def: $vgpr54
+; SI-NEXT:    ; implicit-def: $vgpr34
 ; SI-NEXT:    ; implicit-def: $vgpr43
-; SI-NEXT:    ; implicit-def: $vgpr52
+; SI-NEXT:    ; implicit-def: $vgpr33
 ; SI-NEXT:    ; implicit-def: $vgpr42
-; SI-NEXT:    ; implicit-def: $vgpr49
 ; SI-NEXT:    ; implicit-def: $vgpr41
-; SI-NEXT:    ; implicit-def: $vgpr39
+; SI-NEXT:    ; implicit-def: $vgpr40
 ; SI-NEXT:    ; implicit-def: $vgpr55
-; SI-NEXT:    ; implicit-def: $vgpr37
+; SI-NEXT:    ; implicit-def: $vgpr54
 ; SI-NEXT:    ; implicit-def: $vgpr53
-; SI-NEXT:    ; implicit-def: $vgpr34
+; SI-NEXT:    ; implicit-def: $vgpr52
 ; SI-NEXT:    ; implicit-def: $vgpr51
+; SI-NEXT:    ; implicit-def: $vgpr36
 ; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr50
 ; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr48
 ; SI-NEXT:    ; implicit-def: $vgpr30
-; SI-NEXT:    ; implicit-def: $vgpr38
 ; SI-NEXT:    ; implicit-def: $vgpr29
-; SI-NEXT:    ; implicit-def: $vgpr36
 ; SI-NEXT:    ; implicit-def: $vgpr28
-; SI-NEXT:    ; implicit-def: $vgpr35
 ; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $vgpr33
 ; SI-NEXT:    s_branch .LBB49_2
 ;
 ; VI-LABEL: bitcast_v13f64_to_v52i16_scalar:
@@ -43234,482 +43292,533 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
 ; SI-LABEL: bitcast_v52f16_to_v52i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:4
-; SI-NEXT:    s_waitcnt expcnt(6)
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:32
+; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32
+; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:16
+; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:28
+; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:4
+; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:20
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:24
 ; SI-NEXT:    s_waitcnt expcnt(5)
-; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:12
-; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:16
-; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:20
-; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:24
+; SI-NEXT:    v_cvt_f16_f32_e32 v58, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v3
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:28
+; SI-NEXT:    v_cvt_f16_f32_e32 v62, v6
+; SI-NEXT:    v_cvt_f16_f32_e32 v59, v10
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:32
-; SI-NEXT:    v_cvt_f16_f32_e32 v50, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v38, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v49, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v41, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v39, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v37, v7
-; SI-NEXT:    v_cvt_f16_f32_e32 v40, v8
-; SI-NEXT:    v_cvt_f16_f32_e32 v34, v9
-; SI-NEXT:    v_cvt_f16_f32_e32 v32, v10
-; SI-NEXT:    v_cvt_f16_f32_e32 v35, v11
-; SI-NEXT:    v_cvt_f16_f32_e32 v55, v12
-; SI-NEXT:    v_cvt_f16_f32_e32 v36, v13
-; SI-NEXT:    v_cvt_f16_f32_e32 v31, v14
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v54, v16
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
-; SI-NEXT:    v_cvt_f16_f32_e32 v16, v18
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v19
-; SI-NEXT:    v_cvt_f16_f32_e32 v53, v20
-; SI-NEXT:    v_cvt_f16_f32_e32 v12, v21
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, v22
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, v23
-; SI-NEXT:    v_cvt_f16_f32_e32 v52, v24
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v25
-; SI-NEXT:    v_cvt_f16_f32_e32 v7, v26
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, v27
-; SI-NEXT:    v_cvt_f16_f32_e32 v51, v28
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v29
-; SI-NEXT:    v_cvt_f16_f32_e32 v8, v30
-; SI-NEXT:    v_cvt_f16_f32_e32 v27, s16
-; SI-NEXT:    v_cvt_f16_f32_e32 v24, s18
-; SI-NEXT:    v_cvt_f16_f32_e32 v23, s19
-; SI-NEXT:    v_cvt_f16_f32_e32 v26, s20
-; SI-NEXT:    v_cvt_f16_f32_e32 v25, s22
-; SI-NEXT:    v_cvt_f16_f32_e32 v22, s23
-; SI-NEXT:    v_cvt_f16_f32_e32 v21, s24
-; SI-NEXT:    v_cvt_f16_f32_e32 v19, s26
-; SI-NEXT:    v_cvt_f16_f32_e32 v18, s27
-; SI-NEXT:    v_cvt_f16_f32_e32 v20, s28
-; SI-NEXT:    v_cvt_f16_f32_e32 v30, s29
+; SI-NEXT:    v_cvt_f16_f32_e32 v63, v14
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v7, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v56, v15
+; SI-NEXT:    v_cvt_f16_f32_e32 v47, v18
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f16_f32_e32 v44, v19
+; SI-NEXT:    v_cvt_f16_f32_e32 v60, v22
+; SI-NEXT:    v_cvt_f16_f32_e32 v42, v23
+; SI-NEXT:    v_cvt_f16_f32_e32 v51, v26
+; SI-NEXT:    v_cvt_f16_f32_e32 v40, v27
+; SI-NEXT:    v_cvt_f16_f32_e32 v57, v30
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, s16
+; SI-NEXT:    v_cvt_f16_f32_e32 v46, s19
+; SI-NEXT:    v_cvt_f16_f32_e32 v19, s20
+; SI-NEXT:    v_cvt_f16_f32_e32 v45, s23
+; SI-NEXT:    v_cvt_f16_f32_e32 v15, s24
+; SI-NEXT:    v_cvt_f16_f32_e32 v61, s27
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, s28
+; SI-NEXT:    v_cvt_f16_f32_e32 v26, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v52, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v22, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v50, v8
+; SI-NEXT:    v_cvt_f16_f32_e32 v18, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v48, v12
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v13
+; SI-NEXT:    v_cvt_f16_f32_e32 v37, v16
+; SI-NEXT:    v_cvt_f16_f32_e32 v12, v17
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v21
+; SI-NEXT:    v_cvt_f16_f32_e32 v8, v25
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v29
+; SI-NEXT:    v_cvt_f16_f32_e32 v43, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v41, s21
+; SI-NEXT:    v_cvt_f16_f32_e32 v16, s26
+; SI-NEXT:    v_cvt_f16_f32_e32 v54, s29
+; SI-NEXT:    s_waitcnt vmcnt(10)
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
+; SI-NEXT:    s_waitcnt vmcnt(9)
+; SI-NEXT:    v_cvt_f16_f32_e32 v53, v32
 ; SI-NEXT:    s_waitcnt vmcnt(8)
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v42
+; SI-NEXT:    v_cvt_f16_f32_e32 v32, v33
 ; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_cvt_f16_f32_e32 v29, v43
+; SI-NEXT:    v_cvt_f16_f32_e32 v34, v34
 ; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v44
+; SI-NEXT:    v_cvt_f16_f32_e32 v30, v35
+; SI-NEXT:    v_cvt_f16_f32_e32 v35, v20
+; SI-NEXT:    v_cvt_f16_f32_e32 v33, v24
+; SI-NEXT:    v_cvt_f16_f32_e32 v31, v28
 ; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v45
+; SI-NEXT:    v_cvt_f16_f32_e32 v55, v36
 ; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v46
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v38
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_cvt_f16_f32_e32 v28, v47
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v56
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v57
-; SI-NEXT:    v_cvt_f16_f32_e32 v44, s17
-; SI-NEXT:    v_cvt_f16_f32_e32 v43, s21
-; SI-NEXT:    v_cvt_f16_f32_e32 v42, s25
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v58
+; SI-NEXT:    v_cvt_f16_f32_e32 v27, v39
+; SI-NEXT:    s_waitcnt vmcnt(2) expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v49
+; SI-NEXT:    v_cvt_f16_f32_e32 v24, s18
+; SI-NEXT:    v_cvt_f16_f32_e32 v20, s22
+; SI-NEXT:    v_cvt_f16_f32_e32 v39, s25
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_cbranch_scc0 .LBB59_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_cbranch_execnz .LBB59_3
 ; SI-NEXT:  .LBB59_2: ; %cmp.true
-; SI-NEXT:    v_cvt_f32_f16_e32 v44, v44
-; SI-NEXT:    v_cvt_f32_f16_e32 v43, v43
-; SI-NEXT:    v_cvt_f32_f16_e32 v42, v42
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v41
-; SI-NEXT:    v_add_f32_e32 v44, 0x38000000, v44
-; SI-NEXT:    v_cvt_f16_f32_e32 v44, v44
-; SI-NEXT:    v_add_f32_e32 v43, 0x38000000, v43
-; SI-NEXT:    v_add_f32_e32 v42, 0x38000000, v42
-; SI-NEXT:    v_cvt_f16_f32_e32 v45, v43
-; SI-NEXT:    v_lshlrev_b32_e32 v43, 16, v44
-; SI-NEXT:    v_cvt_f16_f32_e32 v42, v42
-; SI-NEXT:    v_cvt_f32_f16_e32 v44, v30
-; SI-NEXT:    v_cvt_f32_f16_e32 v40, v40
-; SI-NEXT:    v_add_f32_e32 v41, 0x38000000, v41
-; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v42
-; SI-NEXT:    v_add_f32_e32 v42, 0x38000000, v44
-; SI-NEXT:    v_cvt_f16_f32_e32 v42, v42
-; SI-NEXT:    v_add_f32_e32 v40, 0x38000000, v40
-; SI-NEXT:    v_cvt_f16_f32_e32 v44, v41
-; SI-NEXT:    v_cvt_f16_f32_e32 v40, v40
-; SI-NEXT:    v_lshlrev_b32_e32 v41, 16, v42
-; SI-NEXT:    v_cvt_f32_f16_e32 v42, v55
-; SI-NEXT:    v_cvt_f32_f16_e32 v54, v54
-; SI-NEXT:    v_lshlrev_b32_e32 v55, 16, v40
-; SI-NEXT:    v_cvt_f32_f16_e32 v53, v53
-; SI-NEXT:    v_add_f32_e32 v40, 0x38000000, v42
-; SI-NEXT:    v_cvt_f16_f32_e32 v40, v40
-; SI-NEXT:    v_add_f32_e32 v54, 0x38000000, v54
-; SI-NEXT:    v_add_f32_e32 v53, 0x38000000, v53
-; SI-NEXT:    v_cvt_f16_f32_e32 v42, v54
-; SI-NEXT:    v_lshlrev_b32_e32 v54, 16, v40
-; SI-NEXT:    v_cvt_f16_f32_e32 v53, v53
-; SI-NEXT:    v_cvt_f32_f16_e32 v40, v52
-; SI-NEXT:    v_cvt_f32_f16_e32 v51, v51
-; SI-NEXT:    v_cvt_f32_f16_e32 v29, v29
-; SI-NEXT:    v_lshlrev_b32_e32 v52, 16, v53
-; SI-NEXT:    v_add_f32_e32 v53, 0x38000000, v40
-; SI-NEXT:    v_cvt_f16_f32_e32 v53, v53
-; SI-NEXT:    v_add_f32_e32 v51, 0x38000000, v51
-; SI-NEXT:    v_add_f32_e32 v29, 0x38000000, v29
-; SI-NEXT:    v_cvt_f16_f32_e32 v40, v51
-; SI-NEXT:    v_lshlrev_b32_e32 v51, 16, v53
-; SI-NEXT:    v_cvt_f16_f32_e32 v29, v29
-; SI-NEXT:    v_cvt_f32_f16_e32 v53, v28
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v43
+; SI-NEXT:    v_mov_b32_e32 v38, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v41
+; SI-NEXT:    v_mov_b32_e32 v28, v7
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, v54
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_mov_b32_e32 v36, v19
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v39
+; SI-NEXT:    v_mov_b32_e32 v9, v15
+; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v1
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v3
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v7
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, v36
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v29
-; SI-NEXT:    v_add_f32_e32 v29, 0x38000000, v53
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v1
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_or_b32_e32 v39, v7, v19
+; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v38
+; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v38, v7, v15
+; SI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v52
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v50
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; SI-NEXT:    v_cvt_f16_f32_e32 v29, v29
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v48
+; SI-NEXT:    v_mov_b32_e32 v29, v11
+; SI-NEXT:    v_or_b32_e32 v5, v5, v23
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v29
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v37
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
-; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
 ; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v27, v27
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v1
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_or_b32_e32 v5, v5, v25
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v33
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v35
+; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v27
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v27, v27
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
-; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
 ; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
-; SI-NEXT:    v_lshlrev_b32_e32 v57, 16, v29
-; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
 ; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
-; SI-NEXT:    v_or_b32_e32 v5, v5, v29
-; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; SI-NEXT:    v_or_b32_e32 v3, v3, v29
-; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v8
-; SI-NEXT:    v_or_b32_e32 v9, v9, v29
-; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v7
-; SI-NEXT:    v_or_b32_e32 v14, v14, v29
-; SI-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v10
-; SI-NEXT:    v_or_b32_e32 v12, v12, v29
-; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
-; SI-NEXT:    v_cvt_f32_f16_e32 v29, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v36, v36
-; SI-NEXT:    v_add_f32_e32 v16, 0x38000000, v16
-; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; SI-NEXT:    v_add_f32_e32 v17, 0x38000000, v17
-; SI-NEXT:    v_add_f32_e32 v29, 0x38000000, v29
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
-; SI-NEXT:    v_cvt_f16_f32_e32 v31, v29
-; SI-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; SI-NEXT:    v_add_f32_e32 v36, 0x38000000, v36
-; SI-NEXT:    v_cvt_f32_f16_e32 v34, v34
-; SI-NEXT:    v_cvt_f16_f32_e32 v36, v36
-; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v16
-; SI-NEXT:    v_or_b32_e32 v17, v17, v29
-; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v31
-; SI-NEXT:    v_add_f32_e32 v32, 0x38000000, v32
-; SI-NEXT:    v_cvt_f16_f32_e32 v32, v32
-; SI-NEXT:    v_add_f32_e32 v34, 0x38000000, v34
-; SI-NEXT:    v_or_b32_e32 v36, v36, v29
-; SI-NEXT:    v_cvt_f32_f16_e32 v29, v39
-; SI-NEXT:    v_cvt_f16_f32_e32 v34, v34
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, v48
-; SI-NEXT:    v_cvt_f32_f16_e32 v38, v38
-; SI-NEXT:    v_cvt_f32_f16_e32 v50, v50
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v14
 ; SI-NEXT:    v_cvt_f32_f16_e32 v18, v18
-; SI-NEXT:    v_cvt_f32_f16_e32 v19, v19
 ; SI-NEXT:    v_cvt_f32_f16_e32 v22, v22
-; SI-NEXT:    v_lshlrev_b32_e32 v39, 16, v32
-; SI-NEXT:    v_add_f32_e32 v29, 0x38000000, v29
-; SI-NEXT:    v_cvt_f32_f16_e32 v25, v25
-; SI-NEXT:    v_cvt_f32_f16_e32 v23, v23
-; SI-NEXT:    v_or_b32_e32 v34, v34, v39
-; SI-NEXT:    v_cvt_f16_f32_e32 v39, v29
-; SI-NEXT:    v_add_f32_e32 v29, 0x38000000, v48
-; SI-NEXT:    v_add_f32_e32 v38, 0x38000000, v38
-; SI-NEXT:    v_cvt_f32_f16_e32 v24, v24
-; SI-NEXT:    v_cvt_f32_f16_e32 v27, v27
-; SI-NEXT:    v_cvt_f32_f16_e32 v26, v26
-; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT:    v_cvt_f32_f16_e32 v20, v20
-; SI-NEXT:    v_cvt_f32_f16_e32 v49, v49
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v37
-; SI-NEXT:    v_cvt_f32_f16_e32 v35, v35
-; SI-NEXT:    v_cvt_f32_f16_e32 v33, v33
-; SI-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v29, v29
-; SI-NEXT:    v_cvt_f16_f32_e32 v38, v38
-; SI-NEXT:    v_add_f32_e32 v50, 0x38000000, v50
+; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
+; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
 ; SI-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
-; SI-NEXT:    v_cvt_f16_f32_e32 v50, v50
 ; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; SI-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
+; SI-NEXT:    v_cvt_f32_f16_e32 v26, v26
 ; SI-NEXT:    v_add_f32_e32 v22, 0x38000000, v22
-; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
-; SI-NEXT:    v_add_f32_e32 v25, 0x38000000, v25
-; SI-NEXT:    v_add_f32_e32 v23, 0x38000000, v23
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v39
-; SI-NEXT:    v_cvt_f16_f32_e32 v25, v25
-; SI-NEXT:    v_cvt_f16_f32_e32 v23, v23
-; SI-NEXT:    v_add_f32_e32 v24, 0x38000000, v24
-; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v27
+; SI-NEXT:    v_cvt_f32_f16_e32 v20, v20
 ; SI-NEXT:    v_add_f32_e32 v26, 0x38000000, v26
-; SI-NEXT:    v_add_f32_e32 v21, 0x38000000, v21
-; SI-NEXT:    v_add_f32_e32 v20, 0x38000000, v20
-; SI-NEXT:    v_add_f32_e32 v49, 0x38000000, v49
-; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v37
-; SI-NEXT:    v_add_f32_e32 v35, 0x38000000, v35
-; SI-NEXT:    v_add_f32_e32 v33, 0x38000000, v33
-; SI-NEXT:    v_add_f32_e32 v15, 0x38000000, v15
-; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
-; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
-; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
-; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT:    v_or_b32_e32 v48, v29, v48
-; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v38
-; SI-NEXT:    v_cvt_f16_f32_e32 v24, v24
-; SI-NEXT:    v_cvt_f16_f32_e32 v27, v27
 ; SI-NEXT:    v_cvt_f16_f32_e32 v26, v26
-; SI-NEXT:    v_cvt_f16_f32_e32 v21, v21
+; SI-NEXT:    v_cvt_f32_f16_e32 v24, v24
+; SI-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; SI-NEXT:    v_add_f32_e32 v20, 0x38000000, v20
 ; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
-; SI-NEXT:    v_cvt_f16_f32_e32 v49, v49
-; SI-NEXT:    v_cvt_f16_f32_e32 v37, v37
-; SI-NEXT:    v_cvt_f16_f32_e32 v35, v35
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, v33
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; SI-NEXT:    v_or_b32_e32 v50, v50, v29
-; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v18
-; SI-NEXT:    v_or_b32_e32 v19, v19, v29
-; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v22
-; SI-NEXT:    v_or_b32_e32 v25, v25, v29
-; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v23
-; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v45
-; SI-NEXT:    v_lshlrev_b32_e32 v46, 16, v44
-; SI-NEXT:    v_lshlrev_b32_e32 v47, 16, v42
-; SI-NEXT:    v_lshlrev_b32_e32 v56, 16, v40
-; SI-NEXT:    v_or_b32_e32 v24, v24, v29
-; SI-NEXT:    v_or_b32_e32 v27, v27, v43
-; SI-NEXT:    v_or_b32_e32 v26, v26, v45
-; SI-NEXT:    v_or_b32_e32 v21, v21, v30
-; SI-NEXT:    v_or_b32_e32 v20, v20, v41
-; SI-NEXT:    v_or_b32_e32 v49, v49, v46
-; SI-NEXT:    v_or_b32_e32 v37, v37, v55
-; SI-NEXT:    v_or_b32_e32 v35, v35, v54
-; SI-NEXT:    v_or_b32_e32 v33, v33, v47
-; SI-NEXT:    v_or_b32_e32 v15, v15, v52
-; SI-NEXT:    v_or_b32_e32 v13, v13, v51
-; SI-NEXT:    v_or_b32_e32 v11, v11, v56
-; SI-NEXT:    v_or_b32_e32 v6, v6, v28
-; SI-NEXT:    v_or_b32_e32 v4, v4, v57
-; SI-NEXT:    v_alignbit_b32 v44, v24, v43, 16
-; SI-NEXT:    v_alignbit_b32 v43, v25, v45, 16
-; SI-NEXT:    v_alignbit_b32 v42, v19, v30, 16
-; SI-NEXT:    v_alignbit_b32 v30, v50, v41, 16
-; SI-NEXT:    v_alignbit_b32 v41, v48, v46, 16
-; SI-NEXT:    v_alignbit_b32 v40, v34, v55, 16
-; SI-NEXT:    v_alignbit_b32 v55, v36, v54, 16
-; SI-NEXT:    v_alignbit_b32 v54, v17, v47, 16
-; SI-NEXT:    v_alignbit_b32 v53, v12, v52, 16
-; SI-NEXT:    v_alignbit_b32 v52, v14, v51, 16
-; SI-NEXT:    v_alignbit_b32 v51, v9, v56, 16
-; SI-NEXT:    v_alignbit_b32 v29, v3, v28, 16
-; SI-NEXT:    v_alignbit_b32 v28, v5, v57, 16
+; SI-NEXT:    v_add_f32_e32 v24, 0x38000000, v24
+; SI-NEXT:    v_add_f32_e32 v16, 0x38000000, v16
+; SI-NEXT:    v_cvt_f16_f32_e32 v24, v24
+; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v9
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v28
+; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    v_or_b32_e32 v7, v7, v17
+; SI-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v9
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v44
+; SI-NEXT:    v_or_b32_e32 v5, v5, v21
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v56
+; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v28, v9
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v42
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_or_b32_e32 v5, v5, v11
+; SI-NEXT:    v_or_b32_e32 v56, v7, v13
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v31
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v40
+; SI-NEXT:    v_or_b32_e32 v36, v1, v7
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v55
+; SI-NEXT:    v_or_b32_e32 v37, v28, v9
+; SI-NEXT:    v_cvt_f32_f16_e32 v28, v53
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_add_f32_e32 v28, 0x38000000, v28
+; SI-NEXT:    v_cvt_f16_f32_e32 v28, v28
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_or_b32_e32 v35, v3, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v34
+; SI-NEXT:    v_or_b32_e32 v33, v28, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v28, v30
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v29, v1
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v28
+; SI-NEXT:    v_cvt_f32_f16_e32 v28, v32
+; SI-NEXT:    v_cvt_f16_f32_e32 v30, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v27
+; SI-NEXT:    v_or_b32_e32 v31, v29, v1
+; SI-NEXT:    v_add_f32_e32 v28, 0x38000000, v28
+; SI-NEXT:    v_cvt_f16_f32_e32 v29, v28
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v30
+; SI-NEXT:    v_or_b32_e32 v2, v2, v27
+; SI-NEXT:    v_cvt_f32_f16_e32 v27, v57
+; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v29
+; SI-NEXT:    v_or_b32_e32 v4, v4, v28
+; SI-NEXT:    v_cvt_f32_f16_e32 v28, v51
+; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v27
+; SI-NEXT:    v_cvt_f16_f32_e32 v57, v27
+; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v28
+; SI-NEXT:    v_cvt_f32_f16_e32 v28, v60
+; SI-NEXT:    v_cvt_f16_f32_e32 v32, v27
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v57
+; SI-NEXT:    v_or_b32_e32 v6, v6, v27
+; SI-NEXT:    v_add_f32_e32 v28, 0x38000000, v28
+; SI-NEXT:    v_cvt_f16_f32_e32 v60, v28
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v32
+; SI-NEXT:    v_or_b32_e32 v8, v8, v27
+; SI-NEXT:    v_cvt_f32_f16_e32 v27, v47
+; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v60
+; SI-NEXT:    v_or_b32_e32 v10, v10, v28
+; SI-NEXT:    v_cvt_f32_f16_e32 v28, v63
+; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v27
+; SI-NEXT:    v_cvt_f16_f32_e32 v47, v27
+; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v28
+; SI-NEXT:    v_cvt_f32_f16_e32 v28, v59
+; SI-NEXT:    v_cvt_f16_f32_e32 v63, v27
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v47
+; SI-NEXT:    v_or_b32_e32 v12, v12, v27
+; SI-NEXT:    v_add_f32_e32 v28, 0x38000000, v28
+; SI-NEXT:    v_cvt_f16_f32_e32 v59, v28
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v63
+; SI-NEXT:    v_or_b32_e32 v14, v14, v27
+; SI-NEXT:    v_cvt_f32_f16_e32 v27, v62
+; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v59
+; SI-NEXT:    v_or_b32_e32 v18, v18, v28
+; SI-NEXT:    v_cvt_f32_f16_e32 v28, v58
+; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v27
+; SI-NEXT:    v_cvt_f16_f32_e32 v62, v27
+; SI-NEXT:    v_lshr_b64 v[50:51], v[17:18], 16
+; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v28
+; SI-NEXT:    v_cvt_f16_f32_e32 v58, v27
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v62
+; SI-NEXT:    v_or_b32_e32 v22, v22, v27
+; SI-NEXT:    v_cvt_f32_f16_e32 v28, v61
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v58
+; SI-NEXT:    v_or_b32_e32 v26, v26, v27
+; SI-NEXT:    v_cvt_f32_f16_e32 v27, v45
+; SI-NEXT:    v_add_f32_e32 v28, 0x38000000, v28
+; SI-NEXT:    v_cvt_f16_f32_e32 v61, v28
+; SI-NEXT:    v_lshr_b64 v[54:55], v[25:26], 16
+; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v27
+; SI-NEXT:    v_cvt_f16_f32_e32 v45, v27
+; SI-NEXT:    v_cvt_f32_f16_e32 v27, v46
+; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v61
+; SI-NEXT:    v_lshr_b64 v[52:53], v[21:22], 16
+; SI-NEXT:    v_or_b32_e32 v16, v16, v28
+; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v27
+; SI-NEXT:    v_cvt_f16_f32_e32 v46, v27
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v45
+; SI-NEXT:    v_or_b32_e32 v20, v20, v27
+; SI-NEXT:    v_mov_b32_e32 v53, v33
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v46
+; SI-NEXT:    v_or_b32_e32 v24, v24, v27
+; SI-NEXT:    v_lshr_b64 v[43:44], v[23:24], 16
+; SI-NEXT:    v_lshr_b64 v[33:34], v[7:8], 16
+; SI-NEXT:    v_mov_b32_e32 v7, v56
+; SI-NEXT:    v_lshr_b64 v[55:56], v[3:4], 16
+; SI-NEXT:    v_mov_b32_e32 v44, v37
+; SI-NEXT:    v_lshr_b64 v[41:42], v[19:20], 16
+; SI-NEXT:    v_mov_b32_e32 v19, v39
+; SI-NEXT:    v_lshr_b64 v[39:40], v[15:16], 16
+; SI-NEXT:    v_mov_b32_e32 v15, v38
+; SI-NEXT:    v_lshr_b64 v[37:38], v[11:12], 16
+; SI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v42, v36
+; SI-NEXT:    v_mov_b32_e32 v40, v35
+; SI-NEXT:    v_mov_b32_e32 v51, v32
+; SI-NEXT:    v_lshr_b64 v[48:49], v[13:14], 16
+; SI-NEXT:    v_lshr_b64 v[35:36], v[9:10], 16
+; SI-NEXT:    v_mov_b32_e32 v34, v31
+; SI-NEXT:    v_lshr_b64 v[31:32], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[27:28], v[1:2], 16
+; SI-NEXT:    v_mov_b32_e32 v32, v29
 ; SI-NEXT:  .LBB59_3: ; %end
-; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v27
-; SI-NEXT:    v_lshlrev_b32_e32 v44, 16, v44
-; SI-NEXT:    v_and_b32_e32 v24, 0xffff, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; SI-NEXT:    v_or_b32_e32 v27, v27, v44
-; SI-NEXT:    v_or_b32_e32 v23, v24, v23
-; SI-NEXT:    v_add_i32_e32 v24, vcc, 4, v0
-; SI-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v23, v24, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v43
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v26
-; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v43
-; SI-NEXT:    v_or_b32_e32 v23, v23, v24
-; SI-NEXT:    v_add_i32_e32 v24, vcc, 8, v0
-; SI-NEXT:    buffer_store_dword v23, v24, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v24
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v46
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 4, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v25
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v22
-; SI-NEXT:    v_or_b32_e32 v22, v23, v22
-; SI-NEXT:    v_add_i32_e32 v23, vcc, 12, v0
-; SI-NEXT:    buffer_store_dword v22, v23, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v21
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v41
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v19
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 8, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v42
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; SI-NEXT:    v_or_b32_e32 v21, v21, v22
-; SI-NEXT:    v_add_i32_e32 v22, vcc, 16, v0
-; SI-NEXT:    v_or_b32_e32 v18, v19, v18
-; SI-NEXT:    v_add_i32_e32 v19, vcc, 20, v0
-; SI-NEXT:    buffer_store_dword v21, v22, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v18, v19, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v30
-; SI-NEXT:    v_or_b32_e32 v18, v18, v19
-; SI-NEXT:    v_add_i32_e32 v19, vcc, 24, v0
-; SI-NEXT:    buffer_store_dword v18, v19, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v50
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v38
-; SI-NEXT:    v_or_b32_e32 v18, v18, v19
-; SI-NEXT:    v_add_i32_e32 v19, vcc, 28, v0
-; SI-NEXT:    buffer_store_dword v18, v19, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v49
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v41
-; SI-NEXT:    v_or_b32_e32 v18, v18, v19
-; SI-NEXT:    v_add_i32_e32 v19, vcc, 32, v0
-; SI-NEXT:    buffer_store_dword v18, v19, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v20
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v45
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 12, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v48
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v39
-; SI-NEXT:    v_or_b32_e32 v18, v18, v19
-; SI-NEXT:    v_add_i32_e32 v19, vcc, 36, v0
-; SI-NEXT:    buffer_store_dword v18, v19, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v39
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v15
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v37
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v40
-; SI-NEXT:    v_or_b32_e32 v18, v18, v19
-; SI-NEXT:    v_add_i32_e32 v19, vcc, 40, v0
-; SI-NEXT:    buffer_store_dword v18, v19, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v16
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v61
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 20, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v34
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v32
-; SI-NEXT:    v_or_b32_e32 v18, v18, v19
-; SI-NEXT:    v_add_i32_e32 v19, vcc, 44, v0
-; SI-NEXT:    buffer_store_dword v18, v19, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v35
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v55
-; SI-NEXT:    v_or_b32_e32 v18, v18, v19
-; SI-NEXT:    v_add_i32_e32 v19, vcc, 48, v0
-; SI-NEXT:    buffer_store_dword v18, v19, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v36
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v31
-; SI-NEXT:    v_or_b32_e32 v18, v18, v19
-; SI-NEXT:    v_add_i32_e32 v19, vcc, 52, v0
-; SI-NEXT:    buffer_store_dword v18, v19, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v33
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v54
-; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; SI-NEXT:    v_or_b32_e32 v18, v18, v19
-; SI-NEXT:    v_add_i32_e32 v19, vcc, 56, v0
-; SI-NEXT:    v_or_b32_e32 v16, v17, v16
-; SI-NEXT:    v_add_i32_e32 v17, vcc, 60, v0
-; SI-NEXT:    buffer_store_dword v18, v19, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v16, v17, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v54
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v11
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 24, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v53
-; SI-NEXT:    v_and_b32_e32 v12, 0xffff, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; SI-NEXT:    v_or_b32_e32 v15, v15, v16
-; SI-NEXT:    v_add_i32_e32 v16, vcc, 64, v0
-; SI-NEXT:    v_or_b32_e32 v10, v12, v10
-; SI-NEXT:    v_add_i32_e32 v12, vcc, 0x44, v0
-; SI-NEXT:    buffer_store_dword v15, v16, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v10, v12, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v26
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v58
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 28, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v13
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v52
-; SI-NEXT:    v_or_b32_e32 v10, v10, v12
-; SI-NEXT:    v_add_i32_e32 v12, vcc, 0x48, v0
-; SI-NEXT:    buffer_store_dword v10, v12, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v52
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 32, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v14
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; SI-NEXT:    v_or_b32_e32 v7, v10, v7
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 0x4c, v0
-; SI-NEXT:    buffer_store_dword v7, v10, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v22
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v62
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 36, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v51
-; SI-NEXT:    v_or_b32_e32 v7, v7, v10
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 0x50, v0
-; SI-NEXT:    buffer_store_dword v7, v10, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v50
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 40, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; SI-NEXT:    v_or_b32_e32 v7, v7, v8
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x54, v0
-; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v18
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v59
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 44, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v29
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v6, v6, v7
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x58, v0
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v7
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v48
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 48, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v63
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 52, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v56
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v37
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 56, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v12
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v47
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 60, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v44
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v35
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 64, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v10
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v60
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x44, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v42
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v33
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x48, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v8
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v51
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x4c, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v40
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v31
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x50, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v57
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x54, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v53
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v55
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x58, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v32
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x5c, v0
-; SI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v28
-; SI-NEXT:    v_or_b32_e32 v2, v2, v3
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v34
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v27
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x60, v0
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v1, v2, v1
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v30
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x64, v0
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB59_4:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
index 099642646f8cc..8ee5b966f40b8 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
@@ -3637,337 +3637,368 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3
 ; SI-LABEL: bitcast_v28i32_to_v56i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_writelane_b32 v16, s30, 0
+; SI-NEXT:    v_writelane_b32 v16, s31, 1
+; SI-NEXT:    v_writelane_b32 v16, s34, 2
+; SI-NEXT:    v_writelane_b32 v16, s35, 3
+; SI-NEXT:    v_writelane_b32 v16, s36, 4
+; SI-NEXT:    v_writelane_b32 v16, s37, 5
+; SI-NEXT:    v_writelane_b32 v16, s38, 6
+; SI-NEXT:    v_writelane_b32 v16, s39, 7
+; SI-NEXT:    v_writelane_b32 v16, s48, 8
+; SI-NEXT:    v_writelane_b32 v16, s49, 9
+; SI-NEXT:    v_writelane_b32 v16, s50, 10
+; SI-NEXT:    v_writelane_b32 v16, s51, 11
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v15
-; SI-NEXT:    v_readfirstlane_b32 s43, v1
-; SI-NEXT:    v_readfirstlane_b32 s42, v2
-; SI-NEXT:    v_readfirstlane_b32 s41, v3
-; SI-NEXT:    v_readfirstlane_b32 s40, v4
-; SI-NEXT:    v_readfirstlane_b32 s15, v5
-; SI-NEXT:    v_readfirstlane_b32 s14, v6
-; SI-NEXT:    v_readfirstlane_b32 s13, v7
-; SI-NEXT:    v_readfirstlane_b32 s12, v8
-; SI-NEXT:    v_readfirstlane_b32 s11, v9
-; SI-NEXT:    v_readfirstlane_b32 s10, v10
-; SI-NEXT:    v_readfirstlane_b32 s9, v11
-; SI-NEXT:    v_readfirstlane_b32 s8, v12
-; SI-NEXT:    v_readfirstlane_b32 s7, v13
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_readfirstlane_b32 s6, v14
+; SI-NEXT:    v_writelane_b32 v16, s52, 12
+; SI-NEXT:    v_readfirstlane_b32 s40, v1
+; SI-NEXT:    v_readfirstlane_b32 s41, v2
+; SI-NEXT:    v_readfirstlane_b32 s14, v3
+; SI-NEXT:    v_readfirstlane_b32 s15, v4
+; SI-NEXT:    v_readfirstlane_b32 s12, v5
+; SI-NEXT:    v_readfirstlane_b32 s13, v6
+; SI-NEXT:    v_readfirstlane_b32 s10, v7
+; SI-NEXT:    v_readfirstlane_b32 s11, v8
+; SI-NEXT:    v_readfirstlane_b32 s8, v9
+; SI-NEXT:    v_readfirstlane_b32 s9, v10
+; SI-NEXT:    v_readfirstlane_b32 s6, v11
+; SI-NEXT:    v_readfirstlane_b32 s7, v12
+; SI-NEXT:    v_readfirstlane_b32 s4, v13
+; SI-NEXT:    s_and_b64 s[42:43], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s5, v14
+; SI-NEXT:    v_writelane_b32 v16, s53, 13
 ; SI-NEXT:    s_cbranch_scc0 .LBB13_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s9
-; SI-NEXT:    v_mov_b32_e32 v3, s11
-; SI-NEXT:    v_mov_b32_e32 v4, s13
-; SI-NEXT:    v_mov_b32_e32 v5, s15
-; SI-NEXT:    v_mov_b32_e32 v6, s41
-; SI-NEXT:    v_mov_b32_e32 v7, s43
-; SI-NEXT:    v_mov_b32_e32 v8, s28
-; SI-NEXT:    v_mov_b32_e32 v9, s26
-; SI-NEXT:    v_mov_b32_e32 v10, s24
-; SI-NEXT:    v_mov_b32_e32 v11, s22
-; SI-NEXT:    v_mov_b32_e32 v12, s20
-; SI-NEXT:    v_mov_b32_e32 v13, s18
-; SI-NEXT:    v_mov_b32_e32 v14, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v2, s8, v2, 16
-; SI-NEXT:    v_alignbit_b32 v3, s10, v3, 16
-; SI-NEXT:    v_alignbit_b32 v4, s12, v4, 16
-; SI-NEXT:    v_alignbit_b32 v5, s14, v5, 16
-; SI-NEXT:    v_alignbit_b32 v6, s40, v6, 16
-; SI-NEXT:    v_alignbit_b32 v7, s42, v7, 16
-; SI-NEXT:    v_alignbit_b32 v8, s29, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, s27, v9, 16
-; SI-NEXT:    v_alignbit_b32 v10, s25, v10, 16
-; SI-NEXT:    v_alignbit_b32 v11, s23, v11, 16
-; SI-NEXT:    v_alignbit_b32 v12, s21, v12, 16
-; SI-NEXT:    v_alignbit_b32 v13, s19, v13, 16
-; SI-NEXT:    v_alignbit_b32 v14, s17, v14, 16
-; SI-NEXT:    s_lshr_b32 s44, s6, 16
-; SI-NEXT:    s_lshr_b32 s45, s8, 16
-; SI-NEXT:    s_lshr_b32 s46, s10, 16
-; SI-NEXT:    s_lshr_b32 s47, s12, 16
-; SI-NEXT:    s_lshr_b32 s56, s14, 16
-; SI-NEXT:    s_lshr_b32 s57, s40, 16
-; SI-NEXT:    s_lshr_b32 s58, s42, 16
-; SI-NEXT:    s_lshr_b32 s59, s29, 16
-; SI-NEXT:    s_lshr_b32 s60, s27, 16
-; SI-NEXT:    s_lshr_b32 s61, s25, 16
-; SI-NEXT:    s_lshr_b32 s62, s23, 16
-; SI-NEXT:    s_lshr_b32 s63, s21, 16
-; SI-NEXT:    s_lshr_b32 s72, s19, 16
-; SI-NEXT:    s_lshr_b32 s73, s17, 16
+; SI-NEXT:    s_lshr_b32 s30, s5, 16
+; SI-NEXT:    s_lshr_b32 s31, s7, 16
+; SI-NEXT:    s_lshr_b32 s34, s9, 16
+; SI-NEXT:    s_lshr_b32 s35, s11, 16
+; SI-NEXT:    s_lshr_b32 s36, s13, 16
+; SI-NEXT:    s_lshr_b32 s37, s15, 16
+; SI-NEXT:    s_lshr_b32 s38, s41, 16
+; SI-NEXT:    s_lshr_b32 s39, s29, 16
+; SI-NEXT:    s_lshr_b32 s48, s27, 16
+; SI-NEXT:    s_lshr_b32 s49, s25, 16
+; SI-NEXT:    s_lshr_b32 s50, s23, 16
+; SI-NEXT:    s_lshr_b32 s51, s21, 16
+; SI-NEXT:    s_lshr_b32 s52, s19, 16
+; SI-NEXT:    s_lshr_b32 s53, s17, 16
+; SI-NEXT:    s_lshr_b64 s[42:43], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[10:11], 16
+; SI-NEXT:    s_lshr_b64 s[58:59], s[12:13], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[14:15], 16
+; SI-NEXT:    s_lshr_b64 s[62:63], s[40:41], 16
+; SI-NEXT:    s_lshr_b64 s[72:73], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[74:75], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[76:77], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[78:79], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[88:89], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[90:91], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[92:93], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB13_3
 ; SI-NEXT:  .LBB13_2: ; %cmp.true
+; SI-NEXT:    s_add_i32 s17, s17, 3
 ; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    s_add_i32 s19, s19, 3
 ; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_add_i32 s21, s21, 3
 ; SI-NEXT:    s_add_i32 s20, s20, 3
+; SI-NEXT:    s_add_i32 s23, s23, 3
 ; SI-NEXT:    s_add_i32 s22, s22, 3
+; SI-NEXT:    s_add_i32 s25, s25, 3
 ; SI-NEXT:    s_add_i32 s24, s24, 3
+; SI-NEXT:    s_add_i32 s27, s27, 3
 ; SI-NEXT:    s_add_i32 s26, s26, 3
+; SI-NEXT:    s_add_i32 s29, s29, 3
 ; SI-NEXT:    s_add_i32 s28, s28, 3
-; SI-NEXT:    s_add_i32 s43, s43, 3
 ; SI-NEXT:    s_add_i32 s41, s41, 3
-; SI-NEXT:    s_add_i32 s15, s15, 3
-; SI-NEXT:    s_add_i32 s13, s13, 3
-; SI-NEXT:    s_add_i32 s11, s11, 3
-; SI-NEXT:    s_add_i32 s9, s9, 3
-; SI-NEXT:    s_add_i32 s7, s7, 3
-; SI-NEXT:    s_add_i32 s17, s17, 3
-; SI-NEXT:    s_add_i32 s19, s19, 3
-; SI-NEXT:    s_add_i32 s21, s21, 3
-; SI-NEXT:    s_add_i32 s23, s23, 3
-; SI-NEXT:    s_add_i32 s25, s25, 3
-; SI-NEXT:    s_add_i32 s27, s27, 3
-; SI-NEXT:    s_add_i32 s29, s29, 3
-; SI-NEXT:    s_add_i32 s42, s42, 3
 ; SI-NEXT:    s_add_i32 s40, s40, 3
+; SI-NEXT:    s_add_i32 s15, s15, 3
 ; SI-NEXT:    s_add_i32 s14, s14, 3
+; SI-NEXT:    s_add_i32 s13, s13, 3
 ; SI-NEXT:    s_add_i32 s12, s12, 3
+; SI-NEXT:    s_add_i32 s11, s11, 3
 ; SI-NEXT:    s_add_i32 s10, s10, 3
+; SI-NEXT:    s_add_i32 s9, s9, 3
 ; SI-NEXT:    s_add_i32 s8, s8, 3
+; SI-NEXT:    s_add_i32 s7, s7, 3
 ; SI-NEXT:    s_add_i32 s6, s6, 3
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s9
-; SI-NEXT:    v_mov_b32_e32 v3, s11
-; SI-NEXT:    v_mov_b32_e32 v4, s13
-; SI-NEXT:    v_mov_b32_e32 v5, s15
-; SI-NEXT:    v_mov_b32_e32 v6, s41
-; SI-NEXT:    v_mov_b32_e32 v7, s43
-; SI-NEXT:    v_mov_b32_e32 v8, s28
-; SI-NEXT:    v_mov_b32_e32 v9, s26
-; SI-NEXT:    v_mov_b32_e32 v10, s24
-; SI-NEXT:    v_mov_b32_e32 v11, s22
-; SI-NEXT:    v_mov_b32_e32 v12, s20
-; SI-NEXT:    v_mov_b32_e32 v13, s18
-; SI-NEXT:    v_mov_b32_e32 v14, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v2, s8, v2, 16
-; SI-NEXT:    v_alignbit_b32 v3, s10, v3, 16
-; SI-NEXT:    v_alignbit_b32 v4, s12, v4, 16
-; SI-NEXT:    v_alignbit_b32 v5, s14, v5, 16
-; SI-NEXT:    v_alignbit_b32 v6, s40, v6, 16
-; SI-NEXT:    v_alignbit_b32 v7, s42, v7, 16
-; SI-NEXT:    v_alignbit_b32 v8, s29, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, s27, v9, 16
-; SI-NEXT:    v_alignbit_b32 v10, s25, v10, 16
-; SI-NEXT:    v_alignbit_b32 v11, s23, v11, 16
-; SI-NEXT:    v_alignbit_b32 v12, s21, v12, 16
-; SI-NEXT:    v_alignbit_b32 v13, s19, v13, 16
-; SI-NEXT:    v_alignbit_b32 v14, s17, v14, 16
-; SI-NEXT:    s_lshr_b32 s44, s6, 16
-; SI-NEXT:    s_lshr_b32 s45, s8, 16
-; SI-NEXT:    s_lshr_b32 s46, s10, 16
-; SI-NEXT:    s_lshr_b32 s47, s12, 16
-; SI-NEXT:    s_lshr_b32 s56, s14, 16
-; SI-NEXT:    s_lshr_b32 s57, s40, 16
-; SI-NEXT:    s_lshr_b32 s58, s42, 16
-; SI-NEXT:    s_lshr_b32 s59, s29, 16
-; SI-NEXT:    s_lshr_b32 s60, s27, 16
-; SI-NEXT:    s_lshr_b32 s61, s25, 16
-; SI-NEXT:    s_lshr_b32 s62, s23, 16
-; SI-NEXT:    s_lshr_b32 s63, s21, 16
-; SI-NEXT:    s_lshr_b32 s72, s19, 16
-; SI-NEXT:    s_lshr_b32 s73, s17, 16
+; SI-NEXT:    s_add_i32 s5, s5, 3
+; SI-NEXT:    s_add_i32 s4, s4, 3
+; SI-NEXT:    s_lshr_b64 s[42:43], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[10:11], 16
+; SI-NEXT:    s_lshr_b64 s[58:59], s[12:13], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[14:15], 16
+; SI-NEXT:    s_lshr_b64 s[62:63], s[40:41], 16
+; SI-NEXT:    s_lshr_b64 s[72:73], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[74:75], s[26:27], 16
+; SI-NEXT:    s_lshr_b32 s30, s5, 16
+; SI-NEXT:    s_lshr_b32 s31, s7, 16
+; SI-NEXT:    s_lshr_b32 s34, s9, 16
+; SI-NEXT:    s_lshr_b32 s35, s11, 16
+; SI-NEXT:    s_lshr_b32 s36, s13, 16
+; SI-NEXT:    s_lshr_b32 s37, s15, 16
+; SI-NEXT:    s_lshr_b32 s38, s41, 16
+; SI-NEXT:    s_lshr_b32 s39, s29, 16
+; SI-NEXT:    s_lshr_b32 s48, s27, 16
+; SI-NEXT:    s_lshr_b32 s49, s25, 16
+; SI-NEXT:    s_lshr_b32 s50, s23, 16
+; SI-NEXT:    s_lshr_b32 s51, s21, 16
+; SI-NEXT:    s_lshr_b32 s52, s19, 16
+; SI-NEXT:    s_lshr_b32 s53, s17, 16
+; SI-NEXT:    s_lshr_b64 s[76:77], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[78:79], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[88:89], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[90:91], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[92:93], s[16:17], 16
 ; SI-NEXT:  .LBB13_3: ; %end
-; SI-NEXT:    s_and_b32 s4, s16, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT:    v_or_b32_e32 v14, s4, v14
-; SI-NEXT:    s_and_b32 s4, s17, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s73, 16
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_mov_b32_e32 v15, s4
-; SI-NEXT:    s_and_b32 s4, s18, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 4, v0
-; SI-NEXT:    v_or_b32_e32 v13, s4, v13
-; SI-NEXT:    s_and_b32 s4, s19, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s72, 16
-; SI-NEXT:    buffer_store_dword v15, v14, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 8, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v14, s4
-; SI-NEXT:    s_and_b32 s4, s20, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; SI-NEXT:    s_lshl_b32 s43, s92, 16
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s43
+; SI-NEXT:    v_mov_b32_e32 v1, s16
+; SI-NEXT:    s_and_b32 s16, s17, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s53, 16
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_lshl_b32 s16, s90, 16
+; SI-NEXT:    s_and_b32 s17, s18, 0xffff
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    v_mov_b32_e32 v3, s16
+; SI-NEXT:    s_and_b32 s16, s19, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s52, 16
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v4, s16
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v13, vcc, 12, v0
-; SI-NEXT:    v_or_b32_e32 v12, s4, v12
-; SI-NEXT:    s_and_b32 s4, s21, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s63, 16
-; SI-NEXT:    buffer_store_dword v14, v13, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v13, vcc, 16, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v12, v13, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v13, s4
-; SI-NEXT:    s_and_b32 s4, s22, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
+; SI-NEXT:    s_lshl_b32 s16, s88, 16
+; SI-NEXT:    s_and_b32 s17, s20, 0xffff
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 12, v0
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s21, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s51, 16
+; SI-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 16, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v12, vcc, 20, v0
-; SI-NEXT:    v_or_b32_e32 v11, s4, v11
-; SI-NEXT:    s_and_b32 s4, s23, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s62, 16
-; SI-NEXT:    buffer_store_dword v13, v12, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v12, vcc, 24, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v11, v12, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v12, s4
-; SI-NEXT:    s_and_b32 s4, s24, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s22, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s78, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 20, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 28, v0
-; SI-NEXT:    v_or_b32_e32 v10, s4, v10
-; SI-NEXT:    s_and_b32 s4, s25, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s61, 16
-; SI-NEXT:    buffer_store_dword v12, v11, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 32, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v10, v11, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v11, s4
-; SI-NEXT:    s_and_b32 s4, s26, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s23, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s50, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 24, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 36, v0
-; SI-NEXT:    v_or_b32_e32 v9, s4, v9
-; SI-NEXT:    s_and_b32 s4, s27, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s60, 16
-; SI-NEXT:    buffer_store_dword v11, v10, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 40, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v9, v10, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v10, s4
-; SI-NEXT:    s_and_b32 s4, s28, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s24, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s76, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 28, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 44, v0
-; SI-NEXT:    v_or_b32_e32 v8, s4, v8
-; SI-NEXT:    s_and_b32 s4, s29, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s59, 16
-; SI-NEXT:    buffer_store_dword v10, v9, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v8, v9, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v9, s4
-; SI-NEXT:    s_and_b32 s4, s43, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s25, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s49, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 32, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 52, v0
-; SI-NEXT:    v_or_b32_e32 v7, s4, v7
-; SI-NEXT:    s_and_b32 s4, s42, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s58, 16
-; SI-NEXT:    buffer_store_dword v9, v8, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 56, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v8, s4
-; SI-NEXT:    s_and_b32 s4, s41, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s26, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s74, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 36, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 60, v0
-; SI-NEXT:    v_or_b32_e32 v6, s4, v6
-; SI-NEXT:    s_and_b32 s4, s40, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s57, 16
-; SI-NEXT:    buffer_store_dword v8, v7, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 64, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v7, s4
-; SI-NEXT:    s_and_b32 s4, s15, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s27, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s48, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 40, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x44, v0
-; SI-NEXT:    v_or_b32_e32 v5, s4, v5
-; SI-NEXT:    s_and_b32 s4, s14, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s56, 16
-; SI-NEXT:    buffer_store_dword v7, v6, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v6, s4
-; SI-NEXT:    s_and_b32 s4, s13, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s28, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s72, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 44, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x4c, v0
-; SI-NEXT:    v_or_b32_e32 v4, s4, v4
-; SI-NEXT:    s_and_b32 s4, s12, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s47, 16
-; SI-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x50, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v5, s4
-; SI-NEXT:    s_and_b32 s4, s11, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s29, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s39, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 48, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x54, v0
-; SI-NEXT:    v_or_b32_e32 v3, s4, v3
-; SI-NEXT:    s_and_b32 s4, s10, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s46, 16
-; SI-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x58, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v4, s4
-; SI-NEXT:    s_and_b32 s4, s9, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s40, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s62, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 52, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x5c, v0
-; SI-NEXT:    v_or_b32_e32 v2, s4, v2
-; SI-NEXT:    s_and_b32 s4, s8, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s45, 16
-; SI-NEXT:    buffer_store_dword v4, v3, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x60, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v3, s4
-; SI-NEXT:    s_and_b32 s4, s7, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s41, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s38, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 56, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x64, v0
-; SI-NEXT:    v_or_b32_e32 v1, s4, v1
-; SI-NEXT:    s_and_b32 s4, s6, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s44, 16
-; SI-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x68, v0
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s14, s14, 0xffff
+; SI-NEXT:    s_lshl_b32 s16, s60, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 60, v0
+; SI-NEXT:    s_or_b32 s14, s14, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s14
+; SI-NEXT:    s_and_b32 s14, s15, 0xffff
+; SI-NEXT:    s_lshl_b32 s15, s37, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 64, v0
+; SI-NEXT:    s_or_b32 s14, s14, s15
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s14
+; SI-NEXT:    s_and_b32 s12, s12, 0xffff
+; SI-NEXT:    s_lshl_b32 s14, s58, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x44, v0
+; SI-NEXT:    s_or_b32 s12, s12, s14
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s12
+; SI-NEXT:    s_and_b32 s12, s13, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s36, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x48, v0
+; SI-NEXT:    s_or_b32 s12, s12, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s12
+; SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; SI-NEXT:    s_lshl_b32 s12, s56, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x4c, v0
+; SI-NEXT:    s_or_b32 s10, s10, s12
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s10, s11, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s35, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x50, v0
+; SI-NEXT:    s_or_b32 s10, s10, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; SI-NEXT:    s_lshl_b32 s10, s46, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x54, v0
+; SI-NEXT:    s_or_b32 s8, s8, s10
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s8, s9, 0xffff
+; SI-NEXT:    s_lshl_b32 s9, s34, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x58, v0
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_lshl_b32 s8, s44, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x5c, v0
+; SI-NEXT:    s_or_b32 s6, s6, s8
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s6, s7, 0xffff
+; SI-NEXT:    s_lshl_b32 s7, s31, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x60, v0
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; SI-NEXT:    s_lshl_b32 s6, s42, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x64, v0
+; SI-NEXT:    s_or_b32 s4, s4, s6
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s4
+; SI-NEXT:    s_and_b32 s4, s5, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s30, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x68, v0
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x6c, v0
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT:    v_readlane_b32 s53, v16, 13
+; SI-NEXT:    v_readlane_b32 s52, v16, 12
+; SI-NEXT:    v_readlane_b32 s51, v16, 11
+; SI-NEXT:    v_readlane_b32 s50, v16, 10
+; SI-NEXT:    v_readlane_b32 s49, v16, 9
+; SI-NEXT:    v_readlane_b32 s48, v16, 8
+; SI-NEXT:    v_readlane_b32 s39, v16, 7
+; SI-NEXT:    v_readlane_b32 s38, v16, 6
+; SI-NEXT:    v_readlane_b32 s37, v16, 5
+; SI-NEXT:    v_readlane_b32 s36, v16, 4
+; SI-NEXT:    v_readlane_b32 s35, v16, 3
+; SI-NEXT:    v_readlane_b32 s34, v16, 2
+; SI-NEXT:    v_readlane_b32 s31, v16, 1
+; SI-NEXT:    v_readlane_b32 s30, v16, 0
+; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB13_4:
-; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; implicit-def: $sgpr73
-; SI-NEXT:    ; implicit-def: $vgpr13
+; SI-NEXT:    ; implicit-def: $sgpr92
+; SI-NEXT:    ; implicit-def: $sgpr53
+; SI-NEXT:    ; implicit-def: $sgpr90
+; SI-NEXT:    ; implicit-def: $sgpr52
+; SI-NEXT:    ; implicit-def: $sgpr88
+; SI-NEXT:    ; implicit-def: $sgpr51
+; SI-NEXT:    ; implicit-def: $sgpr78
+; SI-NEXT:    ; implicit-def: $sgpr50
+; SI-NEXT:    ; implicit-def: $sgpr76
+; SI-NEXT:    ; implicit-def: $sgpr49
+; SI-NEXT:    ; implicit-def: $sgpr74
+; SI-NEXT:    ; implicit-def: $sgpr48
 ; SI-NEXT:    ; implicit-def: $sgpr72
-; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; implicit-def: $sgpr63
-; SI-NEXT:    ; implicit-def: $vgpr11
+; SI-NEXT:    ; implicit-def: $sgpr39
 ; SI-NEXT:    ; implicit-def: $sgpr62
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $sgpr61
-; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $sgpr38
 ; SI-NEXT:    ; implicit-def: $sgpr60
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $sgpr59
-; SI-NEXT:    ; implicit-def: $vgpr7
+; SI-NEXT:    ; implicit-def: $sgpr37
 ; SI-NEXT:    ; implicit-def: $sgpr58
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $sgpr57
-; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $sgpr36
 ; SI-NEXT:    ; implicit-def: $sgpr56
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $sgpr47
-; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $sgpr35
 ; SI-NEXT:    ; implicit-def: $sgpr46
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $sgpr45
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $sgpr34
 ; SI-NEXT:    ; implicit-def: $sgpr44
+; SI-NEXT:    ; implicit-def: $sgpr31
+; SI-NEXT:    ; implicit-def: $sgpr42
+; SI-NEXT:    ; implicit-def: $sgpr30
 ; SI-NEXT:    s_branch .LBB13_2
 ;
 ; VI-LABEL: bitcast_v28i32_to_v56i16_scalar:
@@ -14769,223 +14800,227 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v15
-; SI-NEXT:    v_mov_b32_e32 v30, s16
-; SI-NEXT:    v_mov_b32_e32 v29, s17
-; SI-NEXT:    v_mov_b32_e32 v25, s18
-; SI-NEXT:    v_mov_b32_e32 v23, s19
-; SI-NEXT:    v_mov_b32_e32 v28, s20
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    v_mov_b32_e32 v27, s16
+; SI-NEXT:    v_mov_b32_e32 v28, s17
+; SI-NEXT:    v_mov_b32_e32 v23, s18
+; SI-NEXT:    v_mov_b32_e32 v24, s19
+; SI-NEXT:    v_mov_b32_e32 v25, s20
 ; SI-NEXT:    v_mov_b32_e32 v26, s21
-; SI-NEXT:    v_mov_b32_e32 v24, s22
+; SI-NEXT:    v_mov_b32_e32 v21, s22
 ; SI-NEXT:    v_mov_b32_e32 v22, s23
-; SI-NEXT:    v_mov_b32_e32 v20, s24
-; SI-NEXT:    v_mov_b32_e32 v19, s25
-; SI-NEXT:    v_mov_b32_e32 v18, s26
-; SI-NEXT:    v_mov_b32_e32 v17, s27
-; SI-NEXT:    v_mov_b32_e32 v16, s28
-; SI-NEXT:    v_mov_b32_e32 v15, s29
-; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v19, s24
+; SI-NEXT:    v_mov_b32_e32 v20, s25
+; SI-NEXT:    v_mov_b32_e32 v17, s26
+; SI-NEXT:    v_mov_b32_e32 v18, s27
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    v_mov_b32_e32 v15, s28
+; SI-NEXT:    v_mov_b32_e32 v16, s29
+; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB29_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_alignbit_b32 v21, v14, v13, 16
-; SI-NEXT:    v_alignbit_b32 v27, v12, v11, 16
-; SI-NEXT:    v_alignbit_b32 v31, v10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v32, v8, v7, 16
-; SI-NEXT:    v_alignbit_b32 v33, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v34, v4, v3, 16
-; SI-NEXT:    v_alignbit_b32 v37, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v39, v15, v16, 16
-; SI-NEXT:    v_alignbit_b32 v49, v17, v18, 16
-; SI-NEXT:    v_alignbit_b32 v51, v19, v20, 16
-; SI-NEXT:    v_alignbit_b32 v54, v22, v24, 16
-; SI-NEXT:    v_alignbit_b32 v40, v26, v28, 16
+; SI-NEXT:    v_lshr_b64 v[29:30], v[13:14], 16
+; SI-NEXT:    v_lshr_b64 v[30:31], v[11:12], 16
+; SI-NEXT:    v_lshr_b64 v[31:32], v[9:10], 16
+; SI-NEXT:    v_lshr_b64 v[32:33], v[7:8], 16
+; SI-NEXT:    v_lshr_b64 v[48:49], v[19:20], 16
+; SI-NEXT:    v_lshr_b64 v[33:34], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[49:50], v[21:22], 16
+; SI-NEXT:    v_lshr_b64 v[34:35], v[3:4], 16
+; SI-NEXT:    v_lshr_b64 v[50:51], v[25:26], 16
+; SI-NEXT:    v_lshr_b64 v[35:36], v[15:16], 16
+; SI-NEXT:    v_lshr_b64 v[51:52], v[23:24], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v2
 ; SI-NEXT:    s_waitcnt expcnt(6)
-; SI-NEXT:    v_alignbit_b32 v42, v23, v25, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v16
+; SI-NEXT:    s_waitcnt expcnt(5)
+; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v18
 ; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    v_alignbit_b32 v44, v29, v30, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v15
-; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v19
+; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v20
 ; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v22
 ; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v26
+; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v26
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v23
+; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v24
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v29
+; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v28
+; SI-NEXT:    v_lshr_b64 v[38:39], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[36:37], v[17:18], 16
+; SI-NEXT:    v_lshr_b64 v[52:53], v[27:28], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB29_3
 ; SI-NEXT:  .LBB29_2: ; %cmp.true
-; SI-NEXT:    v_add_f32_e32 v29, 1.0, v29
-; SI-NEXT:    v_add_f32_e32 v30, 1.0, v30
-; SI-NEXT:    v_add_f32_e32 v23, 1.0, v23
-; SI-NEXT:    v_add_f32_e32 v25, 1.0, v25
-; SI-NEXT:    v_add_f32_e32 v26, 1.0, v26
-; SI-NEXT:    v_add_f32_e32 v28, 1.0, v28
+; SI-NEXT:    v_add_f32_e32 v14, 1.0, v14
+; SI-NEXT:    v_add_f32_e32 v13, 1.0, v13
+; SI-NEXT:    v_add_f32_e32 v12, 1.0, v12
+; SI-NEXT:    v_add_f32_e32 v11, 1.0, v11
+; SI-NEXT:    v_lshr_b64 v[29:30], v[13:14], 16
+; SI-NEXT:    v_add_f32_e32 v10, 1.0, v10
+; SI-NEXT:    v_add_f32_e32 v9, 1.0, v9
+; SI-NEXT:    v_lshr_b64 v[30:31], v[11:12], 16
+; SI-NEXT:    v_add_f32_e32 v20, 1.0, v20
+; SI-NEXT:    v_add_f32_e32 v19, 1.0, v19
+; SI-NEXT:    v_add_f32_e32 v8, 1.0, v8
+; SI-NEXT:    v_add_f32_e32 v7, 1.0, v7
+; SI-NEXT:    v_lshr_b64 v[31:32], v[9:10], 16
 ; SI-NEXT:    v_add_f32_e32 v22, 1.0, v22
+; SI-NEXT:    v_add_f32_e32 v21, 1.0, v21
+; SI-NEXT:    v_add_f32_e32 v6, 1.0, v6
+; SI-NEXT:    v_add_f32_e32 v5, 1.0, v5
+; SI-NEXT:    v_lshr_b64 v[32:33], v[7:8], 16
+; SI-NEXT:    v_lshr_b64 v[48:49], v[19:20], 16
+; SI-NEXT:    v_add_f32_e32 v26, 1.0, v26
+; SI-NEXT:    v_add_f32_e32 v25, 1.0, v25
+; SI-NEXT:    v_add_f32_e32 v4, 1.0, v4
+; SI-NEXT:    v_add_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_lshr_b64 v[33:34], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[49:50], v[21:22], 16
 ; SI-NEXT:    v_add_f32_e32 v24, 1.0, v24
-; SI-NEXT:    v_add_f32_e32 v19, 1.0, v19
-; SI-NEXT:    v_add_f32_e32 v20, 1.0, v20
-; SI-NEXT:    v_add_f32_e32 v17, 1.0, v17
-; SI-NEXT:    v_add_f32_e32 v18, 1.0, v18
-; SI-NEXT:    v_add_f32_e32 v15, 1.0, v15
+; SI-NEXT:    v_add_f32_e32 v23, 1.0, v23
 ; SI-NEXT:    v_add_f32_e32 v16, 1.0, v16
+; SI-NEXT:    v_add_f32_e32 v15, 1.0, v15
+; SI-NEXT:    v_lshr_b64 v[34:35], v[3:4], 16
+; SI-NEXT:    v_lshr_b64 v[50:51], v[25:26], 16
+; SI-NEXT:    v_add_f32_e32 v28, 1.0, v28
+; SI-NEXT:    v_add_f32_e32 v27, 1.0, v27
+; SI-NEXT:    v_add_f32_e32 v18, 1.0, v18
+; SI-NEXT:    v_add_f32_e32 v17, 1.0, v17
 ; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
 ; SI-NEXT:    v_add_f32_e32 v1, 1.0, v1
-; SI-NEXT:    v_add_f32_e32 v4, 1.0, v4
-; SI-NEXT:    v_add_f32_e32 v3, 1.0, v3
-; SI-NEXT:    v_add_f32_e32 v6, 1.0, v6
-; SI-NEXT:    v_add_f32_e32 v5, 1.0, v5
-; SI-NEXT:    v_add_f32_e32 v8, 1.0, v8
-; SI-NEXT:    v_add_f32_e32 v7, 1.0, v7
-; SI-NEXT:    v_add_f32_e32 v10, 1.0, v10
-; SI-NEXT:    v_add_f32_e32 v9, 1.0, v9
-; SI-NEXT:    v_add_f32_e32 v12, 1.0, v12
-; SI-NEXT:    v_add_f32_e32 v11, 1.0, v11
-; SI-NEXT:    v_add_f32_e32 v14, 1.0, v14
-; SI-NEXT:    v_add_f32_e32 v13, 1.0, v13
-; SI-NEXT:    v_alignbit_b32 v21, v14, v13, 16
-; SI-NEXT:    v_alignbit_b32 v27, v12, v11, 16
-; SI-NEXT:    v_alignbit_b32 v31, v10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v32, v8, v7, 16
-; SI-NEXT:    v_alignbit_b32 v33, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v34, v4, v3, 16
-; SI-NEXT:    v_alignbit_b32 v37, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v39, v15, v16, 16
-; SI-NEXT:    v_alignbit_b32 v49, v17, v18, 16
-; SI-NEXT:    v_alignbit_b32 v51, v19, v20, 16
-; SI-NEXT:    v_alignbit_b32 v54, v22, v24, 16
-; SI-NEXT:    v_alignbit_b32 v40, v26, v28, 16
+; SI-NEXT:    v_lshr_b64 v[35:36], v[15:16], 16
+; SI-NEXT:    v_lshr_b64 v[51:52], v[23:24], 16
+; SI-NEXT:    v_lshr_b64 v[38:39], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[36:37], v[17:18], 16
+; SI-NEXT:    v_lshr_b64 v[52:53], v[27:28], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v2
 ; SI-NEXT:    s_waitcnt expcnt(6)
-; SI-NEXT:    v_alignbit_b32 v42, v23, v25, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v16
+; SI-NEXT:    s_waitcnt expcnt(5)
+; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v18
 ; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    v_alignbit_b32 v44, v29, v30, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v15
-; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v19
+; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v20
 ; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v22
 ; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v26
+; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v26
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v23
+; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v24
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v29
+; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v28
 ; SI-NEXT:  .LBB29_3: ; %end
-; SI-NEXT:    v_and_b32_e32 v30, 0xffff, v30
-; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v44, 16, v44
-; SI-NEXT:    v_or_b32_e32 v30, v30, v44
-; SI-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v29, 0xffff, v29
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v56
-; SI-NEXT:    v_or_b32_e32 v29, v29, v30
-; SI-NEXT:    v_add_i32_e32 v30, vcc, 4, v0
-; SI-NEXT:    buffer_store_dword v29, v30, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v25, 0xffff, v25
+; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v52
+; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v27
+; SI-NEXT:    v_or_b32_e32 v27, v27, v37
+; SI-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v28
+; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v59
+; SI-NEXT:    v_or_b32_e32 v27, v27, v28
+; SI-NEXT:    v_add_i32_e32 v28, vcc, 4, v0
+; SI-NEXT:    buffer_store_dword v27, v28, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v42
-; SI-NEXT:    v_or_b32_e32 v25, v25, v29
-; SI-NEXT:    v_add_i32_e32 v29, vcc, 8, v0
-; SI-NEXT:    buffer_store_dword v25, v29, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v51
 ; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v23
+; SI-NEXT:    v_or_b32_e32 v23, v23, v27
+; SI-NEXT:    v_add_i32_e32 v27, vcc, 8, v0
+; SI-NEXT:    buffer_store_dword v23, v27, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v47
-; SI-NEXT:    v_or_b32_e32 v23, v23, v25
-; SI-NEXT:    v_add_i32_e32 v25, vcc, 12, v0
-; SI-NEXT:    buffer_store_dword v23, v25, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v24
+; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v58
+; SI-NEXT:    v_or_b32_e32 v23, v23, v24
+; SI-NEXT:    v_add_i32_e32 v24, vcc, 12, v0
+; SI-NEXT:    buffer_store_dword v23, v24, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v28
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v40
-; SI-NEXT:    v_or_b32_e32 v23, v23, v25
-; SI-NEXT:    v_add_i32_e32 v25, vcc, 16, v0
-; SI-NEXT:    buffer_store_dword v23, v25, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v50
+; SI-NEXT:    v_and_b32_e32 v24, 0xffff, v25
+; SI-NEXT:    v_or_b32_e32 v23, v24, v23
+; SI-NEXT:    v_add_i32_e32 v24, vcc, 16, v0
+; SI-NEXT:    buffer_store_dword v23, v24, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v26
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v46
-; SI-NEXT:    v_or_b32_e32 v23, v23, v25
-; SI-NEXT:    v_add_i32_e32 v25, vcc, 20, v0
-; SI-NEXT:    buffer_store_dword v23, v25, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v54
+; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v57
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v24
-; SI-NEXT:    v_add_i32_e32 v24, vcc, 24, v0
+; SI-NEXT:    v_add_i32_e32 v24, vcc, 20, v0
 ; SI-NEXT:    buffer_store_dword v23, v24, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v22, 0xffff, v22
+; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v21
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v45
-; SI-NEXT:    v_or_b32_e32 v22, v22, v23
-; SI-NEXT:    v_add_i32_e32 v23, vcc, 28, v0
-; SI-NEXT:    buffer_store_dword v22, v23, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v20
+; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v49
+; SI-NEXT:    v_or_b32_e32 v21, v21, v23
+; SI-NEXT:    v_add_i32_e32 v23, vcc, 24, v0
+; SI-NEXT:    buffer_store_dword v21, v23, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v51
-; SI-NEXT:    v_or_b32_e32 v20, v20, v22
-; SI-NEXT:    v_add_i32_e32 v22, vcc, 32, v0
-; SI-NEXT:    buffer_store_dword v20, v22, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v22
+; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v56
+; SI-NEXT:    v_or_b32_e32 v21, v21, v22
+; SI-NEXT:    v_add_i32_e32 v22, vcc, 28, v0
+; SI-NEXT:    buffer_store_dword v21, v22, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v43
+; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v48
+; SI-NEXT:    v_or_b32_e32 v19, v19, v21
+; SI-NEXT:    v_add_i32_e32 v21, vcc, 32, v0
+; SI-NEXT:    buffer_store_dword v19, v21, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v20
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v47
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v20
 ; SI-NEXT:    v_add_i32_e32 v20, vcc, 36, v0
 ; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v18
+; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v49
-; SI-NEXT:    v_or_b32_e32 v18, v18, v19
+; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v36
+; SI-NEXT:    v_or_b32_e32 v17, v17, v19
 ; SI-NEXT:    v_add_i32_e32 v19, vcc, 40, v0
-; SI-NEXT:    buffer_store_dword v18, v19, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; SI-NEXT:    buffer_store_dword v17, v19, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v41
+; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v18
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v46
 ; SI-NEXT:    v_or_b32_e32 v17, v17, v18
 ; SI-NEXT:    v_add_i32_e32 v18, vcc, 44, v0
 ; SI-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v16, 0xffff, v16
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v39
-; SI-NEXT:    v_or_b32_e32 v16, v16, v17
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v35
+; SI-NEXT:    v_or_b32_e32 v15, v15, v17
 ; SI-NEXT:    v_add_i32_e32 v17, vcc, 48, v0
-; SI-NEXT:    buffer_store_dword v16, v17, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
+; SI-NEXT:    buffer_store_dword v15, v17, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v55
+; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v16
+; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v45
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v16
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 52, v0
 ; SI-NEXT:    buffer_store_dword v15, v16, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v37
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v38
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v15
 ; SI-NEXT:    v_add_i32_e32 v15, vcc, 56, v0
 ; SI-NEXT:    buffer_store_dword v1, v15, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v53
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v44
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 60, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -14997,7 +15032,7 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v52
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v43
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x44, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -15009,7 +15044,7 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v50
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v42
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x4c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -15021,7 +15056,7 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v48
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v41
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x54, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -15033,74 +15068,77 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v10
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v38
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v40
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x5c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v27
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v30
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x60, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v36
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v55
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x64, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v13
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v21
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v29
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x68, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v14
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v35
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v54
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x6c, v0
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB29_4:
-; SI-NEXT:    ; implicit-def: $vgpr44
+; SI-NEXT:    ; implicit-def: $vgpr52
+; SI-NEXT:    ; implicit-def: $vgpr59
+; SI-NEXT:    ; implicit-def: $vgpr51
+; SI-NEXT:    ; implicit-def: $vgpr58
+; SI-NEXT:    ; implicit-def: $vgpr50
+; SI-NEXT:    ; implicit-def: $vgpr57
+; SI-NEXT:    ; implicit-def: $vgpr49
 ; SI-NEXT:    ; implicit-def: $vgpr56
-; SI-NEXT:    ; implicit-def: $vgpr42
+; SI-NEXT:    ; implicit-def: $vgpr48
 ; SI-NEXT:    ; implicit-def: $vgpr47
-; SI-NEXT:    ; implicit-def: $vgpr40
+; SI-NEXT:    ; implicit-def: $vgpr36
 ; SI-NEXT:    ; implicit-def: $vgpr46
-; SI-NEXT:    ; implicit-def: $vgpr54
+; SI-NEXT:    ; implicit-def: $vgpr35
 ; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr51
+; SI-NEXT:    ; implicit-def: $vgpr44
 ; SI-NEXT:    ; implicit-def: $vgpr43
-; SI-NEXT:    ; implicit-def: $vgpr49
+; SI-NEXT:    ; implicit-def: $vgpr42
 ; SI-NEXT:    ; implicit-def: $vgpr41
-; SI-NEXT:    ; implicit-def: $vgpr39
+; SI-NEXT:    ; implicit-def: $vgpr40
 ; SI-NEXT:    ; implicit-def: $vgpr55
-; SI-NEXT:    ; implicit-def: $vgpr37
-; SI-NEXT:    ; implicit-def: $vgpr53
+; SI-NEXT:    ; implicit-def: $vgpr54
+; SI-NEXT:    ; implicit-def: $vgpr38
 ; SI-NEXT:    ; implicit-def: $vgpr34
-; SI-NEXT:    ; implicit-def: $vgpr52
 ; SI-NEXT:    ; implicit-def: $vgpr33
-; SI-NEXT:    ; implicit-def: $vgpr50
 ; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr48
 ; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr38
-; SI-NEXT:    ; implicit-def: $vgpr27
-; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr21
-; SI-NEXT:    ; implicit-def: $vgpr35
+; SI-NEXT:    ; implicit-def: $vgpr30
+; SI-NEXT:    ; implicit-def: $vgpr29
 ; SI-NEXT:    s_branch .LBB29_2
 ;
 ; VI-LABEL: bitcast_v28f32_to_v56i16_scalar:
@@ -25036,337 +25074,368 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3
 ; SI-LABEL: bitcast_v14i64_to_v56i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_writelane_b32 v16, s30, 0
+; SI-NEXT:    v_writelane_b32 v16, s31, 1
+; SI-NEXT:    v_writelane_b32 v16, s34, 2
+; SI-NEXT:    v_writelane_b32 v16, s35, 3
+; SI-NEXT:    v_writelane_b32 v16, s36, 4
+; SI-NEXT:    v_writelane_b32 v16, s37, 5
+; SI-NEXT:    v_writelane_b32 v16, s38, 6
+; SI-NEXT:    v_writelane_b32 v16, s39, 7
+; SI-NEXT:    v_writelane_b32 v16, s48, 8
+; SI-NEXT:    v_writelane_b32 v16, s49, 9
+; SI-NEXT:    v_writelane_b32 v16, s50, 10
+; SI-NEXT:    v_writelane_b32 v16, s51, 11
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v15
-; SI-NEXT:    v_readfirstlane_b32 s43, v1
-; SI-NEXT:    v_readfirstlane_b32 s42, v2
-; SI-NEXT:    v_readfirstlane_b32 s41, v3
-; SI-NEXT:    v_readfirstlane_b32 s40, v4
-; SI-NEXT:    v_readfirstlane_b32 s15, v5
-; SI-NEXT:    v_readfirstlane_b32 s14, v6
-; SI-NEXT:    v_readfirstlane_b32 s13, v7
-; SI-NEXT:    v_readfirstlane_b32 s12, v8
-; SI-NEXT:    v_readfirstlane_b32 s11, v9
-; SI-NEXT:    v_readfirstlane_b32 s10, v10
-; SI-NEXT:    v_readfirstlane_b32 s9, v11
-; SI-NEXT:    v_readfirstlane_b32 s8, v12
-; SI-NEXT:    v_readfirstlane_b32 s7, v13
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_readfirstlane_b32 s6, v14
+; SI-NEXT:    v_writelane_b32 v16, s52, 12
+; SI-NEXT:    v_readfirstlane_b32 s40, v1
+; SI-NEXT:    v_readfirstlane_b32 s41, v2
+; SI-NEXT:    v_readfirstlane_b32 s14, v3
+; SI-NEXT:    v_readfirstlane_b32 s15, v4
+; SI-NEXT:    v_readfirstlane_b32 s12, v5
+; SI-NEXT:    v_readfirstlane_b32 s13, v6
+; SI-NEXT:    v_readfirstlane_b32 s10, v7
+; SI-NEXT:    v_readfirstlane_b32 s11, v8
+; SI-NEXT:    v_readfirstlane_b32 s8, v9
+; SI-NEXT:    v_readfirstlane_b32 s9, v10
+; SI-NEXT:    v_readfirstlane_b32 s6, v11
+; SI-NEXT:    v_readfirstlane_b32 s7, v12
+; SI-NEXT:    v_readfirstlane_b32 s4, v13
+; SI-NEXT:    s_and_b64 s[42:43], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s5, v14
+; SI-NEXT:    v_writelane_b32 v16, s53, 13
 ; SI-NEXT:    s_cbranch_scc0 .LBB41_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s9
-; SI-NEXT:    v_mov_b32_e32 v3, s11
-; SI-NEXT:    v_mov_b32_e32 v4, s13
-; SI-NEXT:    v_mov_b32_e32 v5, s15
-; SI-NEXT:    v_mov_b32_e32 v6, s41
-; SI-NEXT:    v_mov_b32_e32 v7, s43
-; SI-NEXT:    v_mov_b32_e32 v8, s28
-; SI-NEXT:    v_mov_b32_e32 v9, s26
-; SI-NEXT:    v_mov_b32_e32 v10, s24
-; SI-NEXT:    v_mov_b32_e32 v11, s22
-; SI-NEXT:    v_mov_b32_e32 v12, s20
-; SI-NEXT:    v_mov_b32_e32 v13, s18
-; SI-NEXT:    v_mov_b32_e32 v14, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v2, s8, v2, 16
-; SI-NEXT:    v_alignbit_b32 v3, s10, v3, 16
-; SI-NEXT:    v_alignbit_b32 v4, s12, v4, 16
-; SI-NEXT:    v_alignbit_b32 v5, s14, v5, 16
-; SI-NEXT:    v_alignbit_b32 v6, s40, v6, 16
-; SI-NEXT:    v_alignbit_b32 v7, s42, v7, 16
-; SI-NEXT:    v_alignbit_b32 v8, s29, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, s27, v9, 16
-; SI-NEXT:    v_alignbit_b32 v10, s25, v10, 16
-; SI-NEXT:    v_alignbit_b32 v11, s23, v11, 16
-; SI-NEXT:    v_alignbit_b32 v12, s21, v12, 16
-; SI-NEXT:    v_alignbit_b32 v13, s19, v13, 16
-; SI-NEXT:    v_alignbit_b32 v14, s17, v14, 16
-; SI-NEXT:    s_lshr_b32 s44, s6, 16
-; SI-NEXT:    s_lshr_b32 s45, s8, 16
-; SI-NEXT:    s_lshr_b32 s46, s10, 16
-; SI-NEXT:    s_lshr_b32 s47, s12, 16
-; SI-NEXT:    s_lshr_b32 s56, s14, 16
-; SI-NEXT:    s_lshr_b32 s57, s40, 16
-; SI-NEXT:    s_lshr_b32 s58, s42, 16
-; SI-NEXT:    s_lshr_b32 s59, s29, 16
-; SI-NEXT:    s_lshr_b32 s60, s27, 16
-; SI-NEXT:    s_lshr_b32 s61, s25, 16
-; SI-NEXT:    s_lshr_b32 s62, s23, 16
-; SI-NEXT:    s_lshr_b32 s63, s21, 16
-; SI-NEXT:    s_lshr_b32 s72, s19, 16
-; SI-NEXT:    s_lshr_b32 s73, s17, 16
+; SI-NEXT:    s_lshr_b32 s30, s5, 16
+; SI-NEXT:    s_lshr_b32 s31, s7, 16
+; SI-NEXT:    s_lshr_b32 s34, s9, 16
+; SI-NEXT:    s_lshr_b32 s35, s11, 16
+; SI-NEXT:    s_lshr_b32 s36, s13, 16
+; SI-NEXT:    s_lshr_b32 s37, s15, 16
+; SI-NEXT:    s_lshr_b32 s38, s41, 16
+; SI-NEXT:    s_lshr_b32 s39, s29, 16
+; SI-NEXT:    s_lshr_b32 s48, s27, 16
+; SI-NEXT:    s_lshr_b32 s49, s25, 16
+; SI-NEXT:    s_lshr_b32 s50, s23, 16
+; SI-NEXT:    s_lshr_b32 s51, s21, 16
+; SI-NEXT:    s_lshr_b32 s52, s19, 16
+; SI-NEXT:    s_lshr_b32 s53, s17, 16
+; SI-NEXT:    s_lshr_b64 s[42:43], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[10:11], 16
+; SI-NEXT:    s_lshr_b64 s[58:59], s[12:13], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[14:15], 16
+; SI-NEXT:    s_lshr_b64 s[62:63], s[40:41], 16
+; SI-NEXT:    s_lshr_b64 s[72:73], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[74:75], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[76:77], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[78:79], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[88:89], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[90:91], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[92:93], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB41_3
 ; SI-NEXT:  .LBB41_2: ; %cmp.true
-; SI-NEXT:    s_add_u32 s16, s16, 3
-; SI-NEXT:    s_addc_u32 s17, s17, 0
-; SI-NEXT:    s_add_u32 s18, s18, 3
-; SI-NEXT:    s_addc_u32 s19, s19, 0
-; SI-NEXT:    s_add_u32 s20, s20, 3
-; SI-NEXT:    s_addc_u32 s21, s21, 0
-; SI-NEXT:    s_add_u32 s22, s22, 3
-; SI-NEXT:    s_addc_u32 s23, s23, 0
-; SI-NEXT:    s_add_u32 s24, s24, 3
-; SI-NEXT:    s_addc_u32 s25, s25, 0
-; SI-NEXT:    s_add_u32 s26, s26, 3
-; SI-NEXT:    s_addc_u32 s27, s27, 0
+; SI-NEXT:    s_add_u32 s4, s4, 3
+; SI-NEXT:    s_addc_u32 s5, s5, 0
+; SI-NEXT:    s_add_u32 s6, s6, 3
+; SI-NEXT:    s_addc_u32 s7, s7, 0
+; SI-NEXT:    s_add_u32 s8, s8, 3
+; SI-NEXT:    s_addc_u32 s9, s9, 0
+; SI-NEXT:    s_add_u32 s10, s10, 3
+; SI-NEXT:    s_addc_u32 s11, s11, 0
+; SI-NEXT:    s_add_u32 s12, s12, 3
+; SI-NEXT:    s_addc_u32 s13, s13, 0
+; SI-NEXT:    s_add_u32 s14, s14, 3
+; SI-NEXT:    s_addc_u32 s15, s15, 0
+; SI-NEXT:    s_add_u32 s40, s40, 3
+; SI-NEXT:    s_addc_u32 s41, s41, 0
 ; SI-NEXT:    s_add_u32 s28, s28, 3
 ; SI-NEXT:    s_addc_u32 s29, s29, 0
-; SI-NEXT:    s_add_u32 s43, s43, 3
-; SI-NEXT:    s_addc_u32 s42, s42, 0
-; SI-NEXT:    s_add_u32 s41, s41, 3
-; SI-NEXT:    s_addc_u32 s40, s40, 0
-; SI-NEXT:    s_add_u32 s15, s15, 3
-; SI-NEXT:    s_addc_u32 s14, s14, 0
-; SI-NEXT:    s_add_u32 s13, s13, 3
-; SI-NEXT:    s_addc_u32 s12, s12, 0
-; SI-NEXT:    s_add_u32 s11, s11, 3
-; SI-NEXT:    s_addc_u32 s10, s10, 0
-; SI-NEXT:    s_add_u32 s9, s9, 3
-; SI-NEXT:    s_addc_u32 s8, s8, 0
-; SI-NEXT:    s_add_u32 s7, s7, 3
-; SI-NEXT:    s_addc_u32 s6, s6, 0
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s9
-; SI-NEXT:    v_mov_b32_e32 v3, s11
-; SI-NEXT:    v_mov_b32_e32 v4, s13
-; SI-NEXT:    v_mov_b32_e32 v5, s15
-; SI-NEXT:    v_mov_b32_e32 v6, s41
-; SI-NEXT:    v_mov_b32_e32 v7, s43
-; SI-NEXT:    v_mov_b32_e32 v8, s28
-; SI-NEXT:    v_mov_b32_e32 v9, s26
-; SI-NEXT:    v_mov_b32_e32 v10, s24
-; SI-NEXT:    v_mov_b32_e32 v11, s22
-; SI-NEXT:    v_mov_b32_e32 v12, s20
-; SI-NEXT:    v_mov_b32_e32 v13, s18
-; SI-NEXT:    v_mov_b32_e32 v14, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v2, s8, v2, 16
-; SI-NEXT:    v_alignbit_b32 v3, s10, v3, 16
-; SI-NEXT:    v_alignbit_b32 v4, s12, v4, 16
-; SI-NEXT:    v_alignbit_b32 v5, s14, v5, 16
-; SI-NEXT:    v_alignbit_b32 v6, s40, v6, 16
-; SI-NEXT:    v_alignbit_b32 v7, s42, v7, 16
-; SI-NEXT:    v_alignbit_b32 v8, s29, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, s27, v9, 16
-; SI-NEXT:    v_alignbit_b32 v10, s25, v10, 16
-; SI-NEXT:    v_alignbit_b32 v11, s23, v11, 16
-; SI-NEXT:    v_alignbit_b32 v12, s21, v12, 16
-; SI-NEXT:    v_alignbit_b32 v13, s19, v13, 16
-; SI-NEXT:    v_alignbit_b32 v14, s17, v14, 16
-; SI-NEXT:    s_lshr_b32 s44, s6, 16
-; SI-NEXT:    s_lshr_b32 s45, s8, 16
-; SI-NEXT:    s_lshr_b32 s46, s10, 16
-; SI-NEXT:    s_lshr_b32 s47, s12, 16
-; SI-NEXT:    s_lshr_b32 s56, s14, 16
-; SI-NEXT:    s_lshr_b32 s57, s40, 16
-; SI-NEXT:    s_lshr_b32 s58, s42, 16
-; SI-NEXT:    s_lshr_b32 s59, s29, 16
-; SI-NEXT:    s_lshr_b32 s60, s27, 16
-; SI-NEXT:    s_lshr_b32 s61, s25, 16
-; SI-NEXT:    s_lshr_b32 s62, s23, 16
-; SI-NEXT:    s_lshr_b32 s63, s21, 16
-; SI-NEXT:    s_lshr_b32 s72, s19, 16
-; SI-NEXT:    s_lshr_b32 s73, s17, 16
+; SI-NEXT:    s_add_u32 s26, s26, 3
+; SI-NEXT:    s_addc_u32 s27, s27, 0
+; SI-NEXT:    s_add_u32 s24, s24, 3
+; SI-NEXT:    s_addc_u32 s25, s25, 0
+; SI-NEXT:    s_add_u32 s22, s22, 3
+; SI-NEXT:    s_addc_u32 s23, s23, 0
+; SI-NEXT:    s_add_u32 s20, s20, 3
+; SI-NEXT:    s_addc_u32 s21, s21, 0
+; SI-NEXT:    s_add_u32 s18, s18, 3
+; SI-NEXT:    s_addc_u32 s19, s19, 0
+; SI-NEXT:    s_add_u32 s16, s16, 3
+; SI-NEXT:    s_addc_u32 s17, s17, 0
+; SI-NEXT:    s_lshr_b32 s30, s5, 16
+; SI-NEXT:    s_lshr_b32 s31, s7, 16
+; SI-NEXT:    s_lshr_b32 s34, s9, 16
+; SI-NEXT:    s_lshr_b32 s35, s11, 16
+; SI-NEXT:    s_lshr_b32 s36, s13, 16
+; SI-NEXT:    s_lshr_b32 s37, s15, 16
+; SI-NEXT:    s_lshr_b32 s38, s41, 16
+; SI-NEXT:    s_lshr_b32 s39, s29, 16
+; SI-NEXT:    s_lshr_b32 s48, s27, 16
+; SI-NEXT:    s_lshr_b32 s49, s25, 16
+; SI-NEXT:    s_lshr_b32 s50, s23, 16
+; SI-NEXT:    s_lshr_b32 s51, s21, 16
+; SI-NEXT:    s_lshr_b32 s52, s19, 16
+; SI-NEXT:    s_lshr_b32 s53, s17, 16
+; SI-NEXT:    s_lshr_b64 s[42:43], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[10:11], 16
+; SI-NEXT:    s_lshr_b64 s[58:59], s[12:13], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[14:15], 16
+; SI-NEXT:    s_lshr_b64 s[62:63], s[40:41], 16
+; SI-NEXT:    s_lshr_b64 s[72:73], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[74:75], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[76:77], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[78:79], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[88:89], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[90:91], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[92:93], s[16:17], 16
 ; SI-NEXT:  .LBB41_3: ; %end
-; SI-NEXT:    s_and_b32 s4, s16, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT:    v_or_b32_e32 v14, s4, v14
-; SI-NEXT:    s_and_b32 s4, s17, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s73, 16
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_mov_b32_e32 v15, s4
-; SI-NEXT:    s_and_b32 s4, s18, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; SI-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 4, v0
-; SI-NEXT:    v_or_b32_e32 v13, s4, v13
-; SI-NEXT:    s_and_b32 s4, s19, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s72, 16
-; SI-NEXT:    buffer_store_dword v15, v14, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 8, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v14, s4
-; SI-NEXT:    s_and_b32 s4, s20, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; SI-NEXT:    s_lshl_b32 s43, s92, 16
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s43
+; SI-NEXT:    v_mov_b32_e32 v1, s16
+; SI-NEXT:    s_and_b32 s16, s17, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s53, 16
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_lshl_b32 s16, s90, 16
+; SI-NEXT:    s_and_b32 s17, s18, 0xffff
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    v_mov_b32_e32 v3, s16
+; SI-NEXT:    s_and_b32 s16, s19, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s52, 16
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v4, s16
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v13, vcc, 12, v0
-; SI-NEXT:    v_or_b32_e32 v12, s4, v12
-; SI-NEXT:    s_and_b32 s4, s21, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s63, 16
-; SI-NEXT:    buffer_store_dword v14, v13, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v13, vcc, 16, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v12, v13, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v13, s4
-; SI-NEXT:    s_and_b32 s4, s22, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
+; SI-NEXT:    s_lshl_b32 s16, s88, 16
+; SI-NEXT:    s_and_b32 s17, s20, 0xffff
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 12, v0
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s21, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s51, 16
+; SI-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 16, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v12, vcc, 20, v0
-; SI-NEXT:    v_or_b32_e32 v11, s4, v11
-; SI-NEXT:    s_and_b32 s4, s23, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s62, 16
-; SI-NEXT:    buffer_store_dword v13, v12, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v12, vcc, 24, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v11, v12, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v12, s4
-; SI-NEXT:    s_and_b32 s4, s24, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s22, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s78, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 20, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 28, v0
-; SI-NEXT:    v_or_b32_e32 v10, s4, v10
-; SI-NEXT:    s_and_b32 s4, s25, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s61, 16
-; SI-NEXT:    buffer_store_dword v12, v11, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 32, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v10, v11, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v11, s4
-; SI-NEXT:    s_and_b32 s4, s26, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s23, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s50, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 24, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 36, v0
-; SI-NEXT:    v_or_b32_e32 v9, s4, v9
-; SI-NEXT:    s_and_b32 s4, s27, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s60, 16
-; SI-NEXT:    buffer_store_dword v11, v10, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 40, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v9, v10, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v10, s4
-; SI-NEXT:    s_and_b32 s4, s28, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s24, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s76, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 28, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 44, v0
-; SI-NEXT:    v_or_b32_e32 v8, s4, v8
-; SI-NEXT:    s_and_b32 s4, s29, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s59, 16
-; SI-NEXT:    buffer_store_dword v10, v9, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v8, v9, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v9, s4
-; SI-NEXT:    s_and_b32 s4, s43, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s25, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s49, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 32, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 52, v0
-; SI-NEXT:    v_or_b32_e32 v7, s4, v7
-; SI-NEXT:    s_and_b32 s4, s42, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s58, 16
-; SI-NEXT:    buffer_store_dword v9, v8, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 56, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v8, s4
-; SI-NEXT:    s_and_b32 s4, s41, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s26, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s74, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 36, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 60, v0
-; SI-NEXT:    v_or_b32_e32 v6, s4, v6
-; SI-NEXT:    s_and_b32 s4, s40, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s57, 16
-; SI-NEXT:    buffer_store_dword v8, v7, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 64, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v7, s4
-; SI-NEXT:    s_and_b32 s4, s15, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s27, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s48, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 40, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x44, v0
-; SI-NEXT:    v_or_b32_e32 v5, s4, v5
-; SI-NEXT:    s_and_b32 s4, s14, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s56, 16
-; SI-NEXT:    buffer_store_dword v7, v6, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v6, s4
-; SI-NEXT:    s_and_b32 s4, s13, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s28, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s72, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 44, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x4c, v0
-; SI-NEXT:    v_or_b32_e32 v4, s4, v4
-; SI-NEXT:    s_and_b32 s4, s12, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s47, 16
-; SI-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x50, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v5, s4
-; SI-NEXT:    s_and_b32 s4, s11, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s29, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s39, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 48, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x54, v0
-; SI-NEXT:    v_or_b32_e32 v3, s4, v3
-; SI-NEXT:    s_and_b32 s4, s10, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s46, 16
-; SI-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x58, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v4, s4
-; SI-NEXT:    s_and_b32 s4, s9, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s40, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s62, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 52, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x5c, v0
-; SI-NEXT:    v_or_b32_e32 v2, s4, v2
-; SI-NEXT:    s_and_b32 s4, s8, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s45, 16
-; SI-NEXT:    buffer_store_dword v4, v3, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x60, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v3, s4
-; SI-NEXT:    s_and_b32 s4, s7, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s41, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s38, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 56, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x64, v0
-; SI-NEXT:    v_or_b32_e32 v1, s4, v1
-; SI-NEXT:    s_and_b32 s4, s6, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s44, 16
-; SI-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x68, v0
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s14, s14, 0xffff
+; SI-NEXT:    s_lshl_b32 s16, s60, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 60, v0
+; SI-NEXT:    s_or_b32 s14, s14, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s14
+; SI-NEXT:    s_and_b32 s14, s15, 0xffff
+; SI-NEXT:    s_lshl_b32 s15, s37, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 64, v0
+; SI-NEXT:    s_or_b32 s14, s14, s15
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s14
+; SI-NEXT:    s_and_b32 s12, s12, 0xffff
+; SI-NEXT:    s_lshl_b32 s14, s58, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x44, v0
+; SI-NEXT:    s_or_b32 s12, s12, s14
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s12
+; SI-NEXT:    s_and_b32 s12, s13, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s36, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x48, v0
+; SI-NEXT:    s_or_b32 s12, s12, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s12
+; SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; SI-NEXT:    s_lshl_b32 s12, s56, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x4c, v0
+; SI-NEXT:    s_or_b32 s10, s10, s12
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s10, s11, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s35, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x50, v0
+; SI-NEXT:    s_or_b32 s10, s10, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; SI-NEXT:    s_lshl_b32 s10, s46, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x54, v0
+; SI-NEXT:    s_or_b32 s8, s8, s10
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s8, s9, 0xffff
+; SI-NEXT:    s_lshl_b32 s9, s34, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x58, v0
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_lshl_b32 s8, s44, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x5c, v0
+; SI-NEXT:    s_or_b32 s6, s6, s8
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s6, s7, 0xffff
+; SI-NEXT:    s_lshl_b32 s7, s31, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x60, v0
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; SI-NEXT:    s_lshl_b32 s6, s42, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x64, v0
+; SI-NEXT:    s_or_b32 s4, s4, s6
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s4
+; SI-NEXT:    s_and_b32 s4, s5, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s30, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x68, v0
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x6c, v0
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT:    v_readlane_b32 s53, v16, 13
+; SI-NEXT:    v_readlane_b32 s52, v16, 12
+; SI-NEXT:    v_readlane_b32 s51, v16, 11
+; SI-NEXT:    v_readlane_b32 s50, v16, 10
+; SI-NEXT:    v_readlane_b32 s49, v16, 9
+; SI-NEXT:    v_readlane_b32 s48, v16, 8
+; SI-NEXT:    v_readlane_b32 s39, v16, 7
+; SI-NEXT:    v_readlane_b32 s38, v16, 6
+; SI-NEXT:    v_readlane_b32 s37, v16, 5
+; SI-NEXT:    v_readlane_b32 s36, v16, 4
+; SI-NEXT:    v_readlane_b32 s35, v16, 3
+; SI-NEXT:    v_readlane_b32 s34, v16, 2
+; SI-NEXT:    v_readlane_b32 s31, v16, 1
+; SI-NEXT:    v_readlane_b32 s30, v16, 0
+; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB41_4:
-; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; implicit-def: $sgpr73
-; SI-NEXT:    ; implicit-def: $vgpr13
+; SI-NEXT:    ; implicit-def: $sgpr92
+; SI-NEXT:    ; implicit-def: $sgpr53
+; SI-NEXT:    ; implicit-def: $sgpr90
+; SI-NEXT:    ; implicit-def: $sgpr52
+; SI-NEXT:    ; implicit-def: $sgpr88
+; SI-NEXT:    ; implicit-def: $sgpr51
+; SI-NEXT:    ; implicit-def: $sgpr78
+; SI-NEXT:    ; implicit-def: $sgpr50
+; SI-NEXT:    ; implicit-def: $sgpr76
+; SI-NEXT:    ; implicit-def: $sgpr49
+; SI-NEXT:    ; implicit-def: $sgpr74
+; SI-NEXT:    ; implicit-def: $sgpr48
 ; SI-NEXT:    ; implicit-def: $sgpr72
-; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; implicit-def: $sgpr63
-; SI-NEXT:    ; implicit-def: $vgpr11
+; SI-NEXT:    ; implicit-def: $sgpr39
 ; SI-NEXT:    ; implicit-def: $sgpr62
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $sgpr61
-; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $sgpr38
 ; SI-NEXT:    ; implicit-def: $sgpr60
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $sgpr59
-; SI-NEXT:    ; implicit-def: $vgpr7
+; SI-NEXT:    ; implicit-def: $sgpr37
 ; SI-NEXT:    ; implicit-def: $sgpr58
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $sgpr57
-; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $sgpr36
 ; SI-NEXT:    ; implicit-def: $sgpr56
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $sgpr47
-; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $sgpr35
 ; SI-NEXT:    ; implicit-def: $sgpr46
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $sgpr45
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $sgpr34
 ; SI-NEXT:    ; implicit-def: $sgpr44
+; SI-NEXT:    ; implicit-def: $sgpr31
+; SI-NEXT:    ; implicit-def: $sgpr42
+; SI-NEXT:    ; implicit-def: $sgpr30
 ; SI-NEXT:    s_branch .LBB41_2
 ;
 ; VI-LABEL: bitcast_v14i64_to_v56i16_scalar:
@@ -34418,194 +34487,198 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    v_mov_b32_e32 v15, s28
 ; SI-NEXT:    v_mov_b32_e32 v16, s29
-; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB49_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_alignbit_b32 v29, v14, v13, 16
-; SI-NEXT:    v_alignbit_b32 v30, v12, v11, 16
-; SI-NEXT:    v_alignbit_b32 v31, v10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v32, v8, v7, 16
-; SI-NEXT:    v_alignbit_b32 v33, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v34, v4, v3, 16
-; SI-NEXT:    v_alignbit_b32 v36, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v38, v16, v15, 16
-; SI-NEXT:    v_alignbit_b32 v48, v18, v17, 16
-; SI-NEXT:    v_alignbit_b32 v51, v20, v19, 16
-; SI-NEXT:    v_alignbit_b32 v53, v22, v21, 16
-; SI-NEXT:    v_alignbit_b32 v40, v26, v25, 16
+; SI-NEXT:    v_lshr_b64 v[29:30], v[13:14], 16
+; SI-NEXT:    v_lshr_b64 v[30:31], v[11:12], 16
+; SI-NEXT:    v_lshr_b64 v[31:32], v[9:10], 16
+; SI-NEXT:    v_lshr_b64 v[32:33], v[7:8], 16
+; SI-NEXT:    v_lshr_b64 v[48:49], v[19:20], 16
+; SI-NEXT:    v_lshr_b64 v[33:34], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[49:50], v[21:22], 16
+; SI-NEXT:    v_lshr_b64 v[34:35], v[3:4], 16
+; SI-NEXT:    v_lshr_b64 v[50:51], v[25:26], 16
+; SI-NEXT:    v_lshr_b64 v[35:36], v[15:16], 16
+; SI-NEXT:    v_lshr_b64 v[51:52], v[23:24], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v2
 ; SI-NEXT:    s_waitcnt expcnt(6)
-; SI-NEXT:    v_alignbit_b32 v42, v24, v23, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v16
+; SI-NEXT:    s_waitcnt expcnt(5)
+; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v18
 ; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    v_alignbit_b32 v44, v28, v27, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v20
 ; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v22
 ; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v26
+; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v26
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v24
+; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v24
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v28
+; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v28
+; SI-NEXT:    v_lshr_b64 v[38:39], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[36:37], v[17:18], 16
+; SI-NEXT:    v_lshr_b64 v[52:53], v[27:28], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB49_3
 ; SI-NEXT:  .LBB49_2: ; %cmp.true
-; SI-NEXT:    v_add_f64 v[27:28], v[27:28], 1.0
-; SI-NEXT:    v_add_f64 v[23:24], v[23:24], 1.0
-; SI-NEXT:    v_add_f64 v[25:26], v[25:26], 1.0
-; SI-NEXT:    v_add_f64 v[21:22], v[21:22], 1.0
+; SI-NEXT:    v_add_f64 v[13:14], v[13:14], 1.0
+; SI-NEXT:    v_add_f64 v[11:12], v[11:12], 1.0
+; SI-NEXT:    v_add_f64 v[9:10], v[9:10], 1.0
+; SI-NEXT:    v_lshr_b64 v[29:30], v[13:14], 16
+; SI-NEXT:    v_add_f64 v[7:8], v[7:8], 1.0
 ; SI-NEXT:    v_add_f64 v[19:20], v[19:20], 1.0
-; SI-NEXT:    v_add_f64 v[17:18], v[17:18], 1.0
+; SI-NEXT:    v_lshr_b64 v[30:31], v[11:12], 16
+; SI-NEXT:    v_add_f64 v[5:6], v[5:6], 1.0
+; SI-NEXT:    v_add_f64 v[21:22], v[21:22], 1.0
+; SI-NEXT:    v_lshr_b64 v[31:32], v[9:10], 16
+; SI-NEXT:    v_add_f64 v[3:4], v[3:4], 1.0
+; SI-NEXT:    v_add_f64 v[25:26], v[25:26], 1.0
+; SI-NEXT:    v_lshr_b64 v[32:33], v[7:8], 16
+; SI-NEXT:    v_lshr_b64 v[48:49], v[19:20], 16
 ; SI-NEXT:    v_add_f64 v[15:16], v[15:16], 1.0
+; SI-NEXT:    v_add_f64 v[23:24], v[23:24], 1.0
+; SI-NEXT:    v_lshr_b64 v[33:34], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[49:50], v[21:22], 16
 ; SI-NEXT:    v_add_f64 v[1:2], v[1:2], 1.0
-; SI-NEXT:    v_add_f64 v[3:4], v[3:4], 1.0
-; SI-NEXT:    v_add_f64 v[5:6], v[5:6], 1.0
-; SI-NEXT:    v_add_f64 v[7:8], v[7:8], 1.0
-; SI-NEXT:    v_add_f64 v[9:10], v[9:10], 1.0
-; SI-NEXT:    v_add_f64 v[13:14], v[13:14], 1.0
-; SI-NEXT:    v_add_f64 v[11:12], v[11:12], 1.0
-; SI-NEXT:    v_alignbit_b32 v29, v14, v13, 16
-; SI-NEXT:    v_alignbit_b32 v30, v12, v11, 16
-; SI-NEXT:    v_alignbit_b32 v31, v10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v32, v8, v7, 16
-; SI-NEXT:    v_alignbit_b32 v33, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v34, v4, v3, 16
-; SI-NEXT:    v_alignbit_b32 v36, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v38, v16, v15, 16
-; SI-NEXT:    v_alignbit_b32 v48, v18, v17, 16
-; SI-NEXT:    v_alignbit_b32 v51, v20, v19, 16
-; SI-NEXT:    v_alignbit_b32 v53, v22, v21, 16
-; SI-NEXT:    v_alignbit_b32 v40, v26, v25, 16
+; SI-NEXT:    v_add_f64 v[17:18], v[17:18], 1.0
+; SI-NEXT:    v_add_f64 v[27:28], v[27:28], 1.0
+; SI-NEXT:    v_lshr_b64 v[34:35], v[3:4], 16
+; SI-NEXT:    v_lshr_b64 v[50:51], v[25:26], 16
+; SI-NEXT:    v_lshr_b64 v[35:36], v[15:16], 16
+; SI-NEXT:    v_lshr_b64 v[51:52], v[23:24], 16
+; SI-NEXT:    v_lshr_b64 v[38:39], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[36:37], v[17:18], 16
+; SI-NEXT:    v_lshr_b64 v[52:53], v[27:28], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v2
 ; SI-NEXT:    s_waitcnt expcnt(6)
-; SI-NEXT:    v_alignbit_b32 v42, v24, v23, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v16
+; SI-NEXT:    s_waitcnt expcnt(5)
+; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v18
 ; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    v_alignbit_b32 v44, v28, v27, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v35, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v20
+; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v20
 ; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v22
 ; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v26
+; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v26
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v24
+; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v24
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v28
+; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v28
 ; SI-NEXT:  .LBB49_3: ; %end
+; SI-NEXT:    v_lshlrev_b32_e32 v37, 16, v52
 ; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v27
-; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v44, 16, v44
-; SI-NEXT:    v_or_b32_e32 v27, v27, v44
+; SI-NEXT:    v_or_b32_e32 v27, v27, v37
 ; SI-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v28
-; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v56
+; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v59
 ; SI-NEXT:    v_or_b32_e32 v27, v27, v28
 ; SI-NEXT:    v_add_i32_e32 v28, vcc, 4, v0
 ; SI-NEXT:    buffer_store_dword v27, v28, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v23
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v42
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v51
+; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v23
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v27
 ; SI-NEXT:    v_add_i32_e32 v27, vcc, 8, v0
 ; SI-NEXT:    buffer_store_dword v23, v27, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v47
+; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v58
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v24
 ; SI-NEXT:    v_add_i32_e32 v24, vcc, 12, v0
 ; SI-NEXT:    buffer_store_dword v23, v24, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v25
-; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v40
-; SI-NEXT:    v_or_b32_e32 v23, v23, v24
+; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v50
+; SI-NEXT:    v_and_b32_e32 v24, 0xffff, v25
+; SI-NEXT:    v_or_b32_e32 v23, v24, v23
 ; SI-NEXT:    v_add_i32_e32 v24, vcc, 16, v0
 ; SI-NEXT:    buffer_store_dword v23, v24, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v26
-; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v46
+; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v57
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v24
 ; SI-NEXT:    v_add_i32_e32 v24, vcc, 20, v0
 ; SI-NEXT:    buffer_store_dword v23, v24, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v21
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v53
+; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v49
 ; SI-NEXT:    v_or_b32_e32 v21, v21, v23
 ; SI-NEXT:    v_add_i32_e32 v23, vcc, 24, v0
 ; SI-NEXT:    buffer_store_dword v21, v23, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v22
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v45
+; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v56
 ; SI-NEXT:    v_or_b32_e32 v21, v21, v22
 ; SI-NEXT:    v_add_i32_e32 v22, vcc, 28, v0
 ; SI-NEXT:    buffer_store_dword v21, v22, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v51
+; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v48
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v21
 ; SI-NEXT:    v_add_i32_e32 v21, vcc, 32, v0
 ; SI-NEXT:    buffer_store_dword v19, v21, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v43
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v47
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v20
 ; SI-NEXT:    v_add_i32_e32 v20, vcc, 36, v0
 ; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v48
+; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v36
 ; SI-NEXT:    v_or_b32_e32 v17, v17, v19
 ; SI-NEXT:    v_add_i32_e32 v19, vcc, 40, v0
 ; SI-NEXT:    buffer_store_dword v17, v19, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v18
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v41
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v46
 ; SI-NEXT:    v_or_b32_e32 v17, v17, v18
 ; SI-NEXT:    v_add_i32_e32 v18, vcc, 44, v0
 ; SI-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v15
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v38
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v35
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v17
 ; SI-NEXT:    v_add_i32_e32 v17, vcc, 48, v0
 ; SI-NEXT:    buffer_store_dword v15, v17, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v16
-; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v55
+; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v45
 ; SI-NEXT:    v_or_b32_e32 v15, v15, v16
 ; SI-NEXT:    v_add_i32_e32 v16, vcc, 52, v0
 ; SI-NEXT:    buffer_store_dword v15, v16, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v36
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v38
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v15
 ; SI-NEXT:    v_add_i32_e32 v15, vcc, 56, v0
 ; SI-NEXT:    buffer_store_dword v1, v15, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v54
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v44
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 60, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -34617,7 +34690,7 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v52
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v43
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x44, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -34629,7 +34702,7 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v50
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v42
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x4c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -34641,7 +34714,7 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v49
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v41
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x54, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -34653,7 +34726,7 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v10
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v39
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v40
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x5c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -34665,7 +34738,7 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v37
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v55
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x64, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -34677,50 +34750,53 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v14
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v35
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v54
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x6c, v0
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB49_4:
-; SI-NEXT:    ; implicit-def: $vgpr44
+; SI-NEXT:    ; implicit-def: $vgpr52
+; SI-NEXT:    ; implicit-def: $vgpr59
+; SI-NEXT:    ; implicit-def: $vgpr51
+; SI-NEXT:    ; implicit-def: $vgpr58
+; SI-NEXT:    ; implicit-def: $vgpr50
+; SI-NEXT:    ; implicit-def: $vgpr57
+; SI-NEXT:    ; implicit-def: $vgpr49
 ; SI-NEXT:    ; implicit-def: $vgpr56
-; SI-NEXT:    ; implicit-def: $vgpr42
+; SI-NEXT:    ; implicit-def: $vgpr48
 ; SI-NEXT:    ; implicit-def: $vgpr47
-; SI-NEXT:    ; implicit-def: $vgpr40
+; SI-NEXT:    ; implicit-def: $vgpr36
 ; SI-NEXT:    ; implicit-def: $vgpr46
-; SI-NEXT:    ; implicit-def: $vgpr53
+; SI-NEXT:    ; implicit-def: $vgpr35
 ; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr51
+; SI-NEXT:    ; implicit-def: $vgpr44
 ; SI-NEXT:    ; implicit-def: $vgpr43
-; SI-NEXT:    ; implicit-def: $vgpr48
+; SI-NEXT:    ; implicit-def: $vgpr42
 ; SI-NEXT:    ; implicit-def: $vgpr41
-; SI-NEXT:    ; implicit-def: $vgpr38
+; SI-NEXT:    ; implicit-def: $vgpr40
 ; SI-NEXT:    ; implicit-def: $vgpr55
-; SI-NEXT:    ; implicit-def: $vgpr36
 ; SI-NEXT:    ; implicit-def: $vgpr54
+; SI-NEXT:    ; implicit-def: $vgpr38
 ; SI-NEXT:    ; implicit-def: $vgpr34
-; SI-NEXT:    ; implicit-def: $vgpr52
 ; SI-NEXT:    ; implicit-def: $vgpr33
-; SI-NEXT:    ; implicit-def: $vgpr50
 ; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr49
 ; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr39
 ; SI-NEXT:    ; implicit-def: $vgpr30
-; SI-NEXT:    ; implicit-def: $vgpr37
 ; SI-NEXT:    ; implicit-def: $vgpr29
-; SI-NEXT:    ; implicit-def: $vgpr35
 ; SI-NEXT:    s_branch .LBB49_2
 ;
 ; VI-LABEL: bitcast_v14f64_to_v56i16_scalar:
@@ -47126,503 +47202,582 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32
-; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:4
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:12
-; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:16
-; SI-NEXT:    s_waitcnt expcnt(6)
-; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:20
-; SI-NEXT:    s_waitcnt expcnt(5)
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:48
+; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32
+; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:16
+; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:28
+; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:32
+; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:44
+; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:4
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:20
+; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:36
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:40
 ; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:28
-; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:32
-; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:36
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:40
+; SI-NEXT:    v_cvt_f16_f32_e32 v59, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:44
-; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:48
-; SI-NEXT:    v_cvt_f16_f32_e32 v52, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v39, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v50, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v44, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v51, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v49, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, v7
-; SI-NEXT:    v_cvt_f16_f32_e32 v43, v8
-; SI-NEXT:    v_cvt_f16_f32_e32 v36, v9
-; SI-NEXT:    v_cvt_f16_f32_e32 v35, v10
-; SI-NEXT:    v_cvt_f16_f32_e32 v38, v11
-; SI-NEXT:    v_cvt_f16_f32_e32 v42, v12
-; SI-NEXT:    v_cvt_f16_f32_e32 v37, v13
-; SI-NEXT:    v_cvt_f16_f32_e32 v31, v14
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v41, v16
-; SI-NEXT:    v_cvt_f16_f32_e32 v34, v17
-; SI-NEXT:    v_cvt_f16_f32_e32 v32, v18
-; SI-NEXT:    v_cvt_f16_f32_e32 v18, v19
-; SI-NEXT:    v_cvt_f16_f32_e32 v40, v20
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v21
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v22
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, v23
-; SI-NEXT:    v_cvt_f16_f32_e32 v55, v24
-; SI-NEXT:    v_cvt_f16_f32_e32 v16, v25
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v26
-; SI-NEXT:    v_cvt_f16_f32_e32 v12, v27
-; SI-NEXT:    v_cvt_f16_f32_e32 v54, v28
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, v29
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, v30
-; SI-NEXT:    v_cvt_f16_f32_e32 v28, s16
-; SI-NEXT:    v_cvt_f16_f32_e32 v25, s18
-; SI-NEXT:    v_cvt_f16_f32_e32 v24, s19
-; SI-NEXT:    v_cvt_f16_f32_e32 v26, s20
-; SI-NEXT:    v_cvt_f16_f32_e32 v27, s22
-; SI-NEXT:    v_cvt_f16_f32_e32 v21, s23
-; SI-NEXT:    v_cvt_f16_f32_e32 v23, s24
-; SI-NEXT:    v_cvt_f16_f32_e32 v20, s26
-; SI-NEXT:    v_cvt_f16_f32_e32 v19, s27
-; SI-NEXT:    v_cvt_f16_f32_e32 v22, s28
+; SI-NEXT:    v_cvt_f16_f32_e32 v63, v23
+; SI-NEXT:    v_cvt_f16_f32_e32 v57, v26
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v6
+; SI-NEXT:    v_cvt_f16_f32_e32 v56, v27
+; SI-NEXT:    v_cvt_f16_f32_e32 v60, v30
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, s16
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v7, v22
+; SI-NEXT:    v_cvt_f16_f32_e32 v61, s19
+; SI-NEXT:    v_cvt_f16_f32_e32 v58, s23
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v10
+; SI-NEXT:    v_cvt_f16_f32_e32 v23, s27
+; SI-NEXT:    v_cvt_f16_f32_e32 v30, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v39, v4
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, v19
+; SI-NEXT:    v_cvt_f16_f32_e32 v19, s28
+; SI-NEXT:    v_cvt_f16_f32_e32 v26, v5
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v54, v8
+; SI-NEXT:    v_cvt_f16_f32_e32 v22, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v52, v12
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v18
+; SI-NEXT:    v_cvt_f16_f32_e32 v18, v13
+; SI-NEXT:    v_cvt_f16_f32_e32 v50, v16
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v17
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, s20
+; SI-NEXT:    v_cvt_f16_f32_e32 v48, v20
+; SI-NEXT:    v_cvt_f16_f32_e32 v12, v21
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v25
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f16_f32_e32 v8, v29
+; SI-NEXT:    v_cvt_f16_f32_e32 v47, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v45, s21
+; SI-NEXT:    v_cvt_f16_f32_e32 v20, s22
+; SI-NEXT:    v_cvt_f16_f32_e32 v43, s25
+; SI-NEXT:    v_cvt_f16_f32_e32 v16, s26
+; SI-NEXT:    v_cvt_f16_f32_e32 v41, s29
+; SI-NEXT:    s_waitcnt vmcnt(14)
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v31
+; SI-NEXT:    v_cvt_f16_f32_e32 v46, v32
+; SI-NEXT:    v_cvt_f16_f32_e32 v42, v33
+; SI-NEXT:    v_cvt_f16_f32_e32 v44, v34
+; SI-NEXT:    v_cvt_f16_f32_e32 v32, v35
+; SI-NEXT:    v_cvt_f16_f32_e32 v34, v36
+; SI-NEXT:    v_cvt_f16_f32_e32 v31, v37
+; SI-NEXT:    v_cvt_f16_f32_e32 v36, s24
+; SI-NEXT:    v_cvt_f16_f32_e32 v37, v24
+; SI-NEXT:    v_cvt_f16_f32_e32 v35, v28
+; SI-NEXT:    s_waitcnt vmcnt(13)
+; SI-NEXT:    v_cvt_f16_f32_e32 v33, v38
 ; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, v53
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v49
 ; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_cvt_f16_f32_e32 v53, v45
+; SI-NEXT:    v_cvt_f16_f32_e32 v62, v51
 ; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v46
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v53
 ; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v47
-; SI-NEXT:    s_waitcnt vmcnt(8)
-; SI-NEXT:    v_cvt_f16_f32_e32 v8, v56
-; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_cvt_f16_f32_e32 v30, v57
-; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_cvt_f16_f32_e32 v7, v58
-; SI-NEXT:    s_waitcnt vmcnt(5)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v59
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_cvt_f16_f32_e32 v57, v60
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_cvt_f16_f32_e32 v29, v61
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v62
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v63
-; SI-NEXT:    v_cvt_f16_f32_e32 v56, s17
-; SI-NEXT:    v_cvt_f16_f32_e32 v47, s21
-; SI-NEXT:    v_cvt_f16_f32_e32 v46, s25
-; SI-NEXT:    v_cvt_f16_f32_e32 v45, s29
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
+; SI-NEXT:    v_cvt_f16_f32_e32 v27, v55
+; SI-NEXT:    s_waitcnt vmcnt(8) expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v40
+; SI-NEXT:    v_cvt_f16_f32_e32 v24, s18
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_cbranch_scc0 .LBB59_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_cbranch_execnz .LBB59_3
 ; SI-NEXT:  .LBB59_2: ; %cmp.true
-; SI-NEXT:    v_cvt_f32_f16_e32 v33, v56
-; SI-NEXT:    v_cvt_f32_f16_e32 v47, v47
-; SI-NEXT:    v_cvt_f32_f16_e32 v45, v45
-; SI-NEXT:    v_cvt_f32_f16_e32 v42, v42
-; SI-NEXT:    v_add_f32_e32 v33, 0x38000000, v33
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, v33
-; SI-NEXT:    v_add_f32_e32 v56, 0x38000000, v47
-; SI-NEXT:    v_add_f32_e32 v45, 0x38000000, v45
-; SI-NEXT:    v_add_f32_e32 v42, 0x38000000, v42
-; SI-NEXT:    v_lshlrev_b32_e32 v47, 16, v33
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, v56
-; SI-NEXT:    v_cvt_f32_f16_e32 v56, v46
-; SI-NEXT:    v_cvt_f32_f16_e32 v55, v55
-; SI-NEXT:    v_cvt_f32_f16_e32 v30, v30
-; SI-NEXT:    v_lshlrev_b32_e32 v46, 16, v33
-; SI-NEXT:    v_add_f32_e32 v33, 0x38000000, v56
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, v33
-; SI-NEXT:    v_cvt_f16_f32_e32 v56, v45
-; SI-NEXT:    v_add_f32_e32 v55, 0x38000000, v55
-; SI-NEXT:    v_mov_b32_e32 v61, v57
-; SI-NEXT:    v_lshlrev_b32_e32 v45, 16, v33
-; SI-NEXT:    v_cvt_f32_f16_e32 v33, v44
-; SI-NEXT:    v_cvt_f32_f16_e32 v44, v43
-; SI-NEXT:    v_add_f32_e32 v30, 0x38000000, v30
-; SI-NEXT:    v_cvt_f32_f16_e32 v29, v29
-; SI-NEXT:    v_add_f32_e32 v33, 0x38000000, v33
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, v33
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v47
+; SI-NEXT:    v_mov_b32_e32 v28, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v45
+; SI-NEXT:    v_mov_b32_e32 v51, v23
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v43
+; SI-NEXT:    v_mov_b32_e32 v49, v19
+; SI-NEXT:    v_mov_b32_e32 v53, v36
+; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v1
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_mov_b32_e32 v36, v7
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, v41
+; SI-NEXT:    v_mov_b32_e32 v38, v15
+; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v1
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v28
+; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v39
+; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v54
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_or_b32_e32 v5, v5, v23
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v52
+; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v50
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v27, v27
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v48
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v37
+; SI-NEXT:    v_mov_b32_e32 v37, v11
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v27
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v35
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v62
+; SI-NEXT:    v_cvt_f16_f32_e32 v27, v27
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v33
+; SI-NEXT:    v_mov_b32_e32 v33, v31
+; SI-NEXT:    v_cvt_f32_f16_e32 v31, v53
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v43, 16, v33
-; SI-NEXT:    v_cvt_f32_f16_e32 v33, v48
-; SI-NEXT:    v_add_f32_e32 v48, 0x38000000, v44
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, v48
-; SI-NEXT:    v_cvt_f16_f32_e32 v44, v42
-; SI-NEXT:    v_add_f32_e32 v33, 0x38000000, v33
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, v33
-; SI-NEXT:    v_lshlrev_b32_e32 v42, 16, v48
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT:    v_or_b32_e32 v48, v33, v42
-; SI-NEXT:    v_cvt_f32_f16_e32 v33, v41
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v40
-; SI-NEXT:    v_add_f32_e32 v29, 0x38000000, v29
 ; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT:    v_add_f32_e32 v33, 0x38000000, v33
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, v33
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; SI-NEXT:    v_cvt_f16_f32_e32 v29, v29
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v33
-; SI-NEXT:    v_add_f32_e32 v33, 0x38000000, v41
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, v33
-; SI-NEXT:    v_cvt_f16_f32_e32 v41, v55
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v55, 16, v33
-; SI-NEXT:    v_cvt_f32_f16_e32 v33, v54
-; SI-NEXT:    v_cvt_f32_f16_e32 v54, v53
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT:    v_add_f32_e32 v33, 0x38000000, v33
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, v33
-; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; SI-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; SI-NEXT:    v_lshlrev_b32_e32 v53, 16, v33
-; SI-NEXT:    v_add_f32_e32 v33, 0x38000000, v54
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, v33
-; SI-NEXT:    v_cvt_f16_f32_e32 v54, v30
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v33
-; SI-NEXT:    v_cvt_f32_f16_e32 v33, v61
-; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
-; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; SI-NEXT:    v_add_f32_e32 v33, 0x38000000, v33
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, v33
-; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
-; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v29
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
-; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v60, 16, v54
-; SI-NEXT:    v_or_b32_e32 v54, v33, v29
-; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT:    v_add_f32_e32 v16, 0x38000000, v16
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
+; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
+; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
 ; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
-; SI-NEXT:    v_or_b32_e32 v4, v4, v33
-; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
 ; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; SI-NEXT:    v_add_f32_e32 v15, 0x38000000, v15
-; SI-NEXT:    v_or_b32_e32 v7, v7, v33
-; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; SI-NEXT:    v_or_b32_e32 v6, v6, v33
-; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v11
-; SI-NEXT:    v_or_b32_e32 v13, v13, v33
-; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v9
-; SI-NEXT:    v_or_b32_e32 v16, v16, v33
-; SI-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v14
-; SI-NEXT:    v_or_b32_e32 v15, v15, v33
-; SI-NEXT:    v_cvt_f32_f16_e32 v33, v34
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v37
-; SI-NEXT:    v_add_f32_e32 v32, 0x38000000, v32
-; SI-NEXT:    v_cvt_f16_f32_e32 v32, v32
-; SI-NEXT:    v_add_f32_e32 v33, 0x38000000, v33
-; SI-NEXT:    v_add_f32_e32 v31, 0x38000000, v31
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, v33
-; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v35, v35
-; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v37
-; SI-NEXT:    v_cvt_f32_f16_e32 v36, v36
-; SI-NEXT:    v_cvt_f16_f32_e32 v37, v37
-; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v32
-; SI-NEXT:    v_or_b32_e32 v34, v33, v34
-; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v31
-; SI-NEXT:    v_add_f32_e32 v35, 0x38000000, v35
-; SI-NEXT:    v_cvt_f16_f32_e32 v35, v35
-; SI-NEXT:    v_add_f32_e32 v36, 0x38000000, v36
-; SI-NEXT:    v_or_b32_e32 v37, v37, v33
-; SI-NEXT:    v_cvt_f32_f16_e32 v33, v49
-; SI-NEXT:    v_cvt_f16_f32_e32 v36, v36
-; SI-NEXT:    v_cvt_f32_f16_e32 v51, v51
-; SI-NEXT:    v_cvt_f32_f16_e32 v39, v39
-; SI-NEXT:    v_cvt_f32_f16_e32 v52, v52
-; SI-NEXT:    v_cvt_f32_f16_e32 v19, v19
+; SI-NEXT:    v_cvt_f32_f16_e32 v18, v18
+; SI-NEXT:    v_cvt_f32_f16_e32 v22, v22
+; SI-NEXT:    v_cvt_f32_f16_e32 v26, v26
+; SI-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; SI-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
+; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
+; SI-NEXT:    v_add_f32_e32 v22, 0x38000000, v22
+; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v28, v28
+; SI-NEXT:    v_add_f32_e32 v26, 0x38000000, v26
+; SI-NEXT:    v_cvt_f16_f32_e32 v26, v26
 ; SI-NEXT:    v_cvt_f32_f16_e32 v20, v20
-; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT:    v_lshlrev_b32_e32 v49, 16, v35
-; SI-NEXT:    v_add_f32_e32 v33, 0x38000000, v33
-; SI-NEXT:    v_cvt_f32_f16_e32 v27, v27
+; SI-NEXT:    v_add_f32_e32 v28, 0x38000000, v28
+; SI-NEXT:    v_cvt_f16_f32_e32 v28, v28
 ; SI-NEXT:    v_cvt_f32_f16_e32 v24, v24
-; SI-NEXT:    v_or_b32_e32 v36, v36, v49
-; SI-NEXT:    v_cvt_f16_f32_e32 v49, v33
-; SI-NEXT:    v_add_f32_e32 v33, 0x38000000, v51
-; SI-NEXT:    v_add_f32_e32 v39, 0x38000000, v39
-; SI-NEXT:    v_cvt_f32_f16_e32 v25, v25
-; SI-NEXT:    v_cvt_f32_f16_e32 v28, v28
-; SI-NEXT:    v_cvt_f32_f16_e32 v26, v26
-; SI-NEXT:    v_cvt_f32_f16_e32 v23, v23
-; SI-NEXT:    v_cvt_f32_f16_e32 v22, v22
-; SI-NEXT:    v_cvt_f32_f16_e32 v50, v50
-; SI-NEXT:    v_cvt_f32_f16_e32 v38, v38
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v18, v18
-; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, v33
-; SI-NEXT:    v_cvt_f16_f32_e32 v39, v39
-; SI-NEXT:    v_add_f32_e32 v52, 0x38000000, v52
-; SI-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
-; SI-NEXT:    v_cvt_f16_f32_e32 v52, v52
-; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
+; SI-NEXT:    v_add_f32_e32 v16, 0x38000000, v16
+; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_or_b32_e32 v5, v28, v19
+; SI-NEXT:    v_add_f32_e32 v28, 0x38000000, v31
+; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f16_f32_e32 v28, v28
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v49
 ; SI-NEXT:    v_add_f32_e32 v20, 0x38000000, v20
-; SI-NEXT:    v_add_f32_e32 v21, 0x38000000, v21
+; SI-NEXT:    v_or_b32_e32 v28, v28, v15
+; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
-; SI-NEXT:    v_cvt_f16_f32_e32 v21, v21
-; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v27
 ; SI-NEXT:    v_add_f32_e32 v24, 0x38000000, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v51, 16, v49
-; SI-NEXT:    v_cvt_f16_f32_e32 v27, v27
 ; SI-NEXT:    v_cvt_f16_f32_e32 v24, v24
-; SI-NEXT:    v_add_f32_e32 v25, 0x38000000, v25
+; SI-NEXT:    v_or_b32_e32 v39, v5, v29
+; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
+; SI-NEXT:    v_add_f32_e32 v31, 0x38000000, v31
+; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT:    v_or_b32_e32 v5, v31, v25
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_cvt_f32_f16_e32 v28, v28
+; SI-NEXT:    v_cvt_f32_f16_e32 v31, v38
 ; SI-NEXT:    v_add_f32_e32 v28, 0x38000000, v28
-; SI-NEXT:    v_add_f32_e32 v26, 0x38000000, v26
-; SI-NEXT:    v_add_f32_e32 v23, 0x38000000, v23
-; SI-NEXT:    v_add_f32_e32 v22, 0x38000000, v22
-; SI-NEXT:    v_add_f32_e32 v50, 0x38000000, v50
-; SI-NEXT:    v_add_f32_e32 v38, 0x38000000, v38
-; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
-; SI-NEXT:    v_add_f32_e32 v17, 0x38000000, v17
-; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
-; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
-; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
-; SI-NEXT:    v_or_b32_e32 v51, v33, v51
-; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v39
-; SI-NEXT:    v_cvt_f16_f32_e32 v25, v25
 ; SI-NEXT:    v_cvt_f16_f32_e32 v28, v28
-; SI-NEXT:    v_cvt_f16_f32_e32 v26, v26
-; SI-NEXT:    v_cvt_f16_f32_e32 v23, v23
-; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
-; SI-NEXT:    v_cvt_f16_f32_e32 v50, v50
-; SI-NEXT:    v_cvt_f16_f32_e32 v38, v38
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
-; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; SI-NEXT:    v_or_b32_e32 v52, v52, v33
-; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v19
-; SI-NEXT:    v_or_b32_e32 v20, v20, v33
-; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v21
-; SI-NEXT:    v_or_b32_e32 v27, v27, v33
-; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v57, 16, v56
-; SI-NEXT:    v_lshlrev_b32_e32 v58, 16, v44
-; SI-NEXT:    v_lshlrev_b32_e32 v59, 16, v41
-; SI-NEXT:    v_or_b32_e32 v25, v25, v33
-; SI-NEXT:    v_or_b32_e32 v28, v28, v47
-; SI-NEXT:    v_or_b32_e32 v26, v26, v46
-; SI-NEXT:    v_or_b32_e32 v23, v23, v45
-; SI-NEXT:    v_or_b32_e32 v22, v22, v57
-; SI-NEXT:    v_or_b32_e32 v50, v50, v43
-; SI-NEXT:    v_or_b32_e32 v38, v38, v58
-; SI-NEXT:    v_or_b32_e32 v3, v3, v40
-; SI-NEXT:    v_or_b32_e32 v18, v18, v55
-; SI-NEXT:    v_or_b32_e32 v17, v17, v59
-; SI-NEXT:    v_or_b32_e32 v12, v12, v53
-; SI-NEXT:    v_or_b32_e32 v10, v10, v30
-; SI-NEXT:    v_or_b32_e32 v8, v8, v60
-; SI-NEXT:    v_alignbit_b32 v56, v25, v47, 16
-; SI-NEXT:    v_alignbit_b32 v47, v27, v46, 16
-; SI-NEXT:    v_alignbit_b32 v46, v20, v45, 16
-; SI-NEXT:    v_alignbit_b32 v45, v52, v57, 16
-; SI-NEXT:    v_mov_b32_e32 v57, v54
-; SI-NEXT:    v_alignbit_b32 v44, v51, v43, 16
-; SI-NEXT:    v_alignbit_b32 v43, v36, v42, 16
-; SI-NEXT:    v_alignbit_b32 v42, v37, v58, 16
-; SI-NEXT:    v_alignbit_b32 v41, v34, v40, 16
-; SI-NEXT:    v_alignbit_b32 v40, v15, v55, 16
-; SI-NEXT:    v_alignbit_b32 v55, v16, v59, 16
-; SI-NEXT:    v_alignbit_b32 v54, v13, v53, 16
-; SI-NEXT:    v_alignbit_b32 v53, v6, v30, 16
-; SI-NEXT:    v_alignbit_b32 v30, v7, v60, 16
-; SI-NEXT:    v_alignbit_b32 v29, v4, v29, 16
-; SI-NEXT:  .LBB59_3: ; %end
-; SI-NEXT:    v_and_b32_e32 v28, 0xffff, v28
-; SI-NEXT:    v_lshlrev_b32_e32 v33, 16, v56
-; SI-NEXT:    v_and_b32_e32 v25, 0xffff, v25
-; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; SI-NEXT:    v_or_b32_e32 v28, v28, v33
-; SI-NEXT:    v_or_b32_e32 v24, v25, v24
-; SI-NEXT:    v_add_i32_e32 v25, vcc, 4, v0
-; SI-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v24, v25, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v24, 0xffff, v26
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v47
-; SI-NEXT:    v_or_b32_e32 v24, v24, v25
-; SI-NEXT:    v_add_i32_e32 v25, vcc, 8, v0
-; SI-NEXT:    buffer_store_dword v24, v25, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v24, 0xffff, v27
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
-; SI-NEXT:    v_or_b32_e32 v21, v24, v21
-; SI-NEXT:    v_add_i32_e32 v24, vcc, 12, v0
-; SI-NEXT:    buffer_store_dword v21, v24, s[0:3], 0 offen
+; SI-NEXT:    v_add_f32_e32 v31, 0x38000000, v31
+; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT:    v_or_b32_e32 v28, v28, v21
+; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v23
-; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v46
-; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; SI-NEXT:    v_or_b32_e32 v21, v21, v23
-; SI-NEXT:    v_add_i32_e32 v23, vcc, 16, v0
-; SI-NEXT:    v_or_b32_e32 v19, v20, v19
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 20, v0
-; SI-NEXT:    buffer_store_dword v21, v23, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f32_f16_e32 v28, v37
+; SI-NEXT:    v_or_b32_e32 v38, v31, v13
+; SI-NEXT:    v_cvt_f32_f16_e32 v31, v56
+; SI-NEXT:    v_add_f32_e32 v28, 0x38000000, v28
+; SI-NEXT:    v_cvt_f16_f32_e32 v28, v28
+; SI-NEXT:    v_add_f32_e32 v31, 0x38000000, v31
+; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
+; SI-NEXT:    v_or_b32_e32 v37, v28, v11
+; SI-NEXT:    v_or_b32_e32 v62, v31, v7
+; SI-NEXT:    v_cvt_f32_f16_e32 v31, v33
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    v_or_b32_e32 v5, v5, v17
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v22
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v45
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 24, v0
-; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v63
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    v_or_b32_e32 v5, v5, v9
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v52
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v39
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 28, v0
-; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v46
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v28, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v44
+; SI-NEXT:    v_or_b32_e32 v35, v28, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v28, v34
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_add_f32_e32 v28, 0x38000000, v28
+; SI-NEXT:    v_cvt_f16_f32_e32 v28, v28
+; SI-NEXT:    v_or_b32_e32 v34, v1, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v27
+; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v31
+; SI-NEXT:    v_or_b32_e32 v56, v28, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v28, v32
+; SI-NEXT:    v_cvt_f16_f32_e32 v31, v27
+; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v28
+; SI-NEXT:    v_cvt_f32_f16_e32 v28, v42
+; SI-NEXT:    v_cvt_f16_f32_e32 v32, v27
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v31
+; SI-NEXT:    v_or_b32_e32 v2, v2, v27
+; SI-NEXT:    v_add_f32_e32 v28, 0x38000000, v28
+; SI-NEXT:    v_cvt_f16_f32_e32 v33, v28
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v32
+; SI-NEXT:    v_or_b32_e32 v4, v4, v27
+; SI-NEXT:    v_cvt_f32_f16_e32 v27, v60
+; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v33
+; SI-NEXT:    v_or_b32_e32 v6, v6, v28
+; SI-NEXT:    v_cvt_f32_f16_e32 v28, v57
+; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v27
+; SI-NEXT:    v_cvt_f16_f32_e32 v60, v27
+; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v28
+; SI-NEXT:    v_cvt_f32_f16_e32 v28, v36
+; SI-NEXT:    v_cvt_f16_f32_e32 v57, v27
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v60
+; SI-NEXT:    v_or_b32_e32 v8, v8, v27
+; SI-NEXT:    v_add_f32_e32 v28, 0x38000000, v28
+; SI-NEXT:    v_cvt_f16_f32_e32 v63, v28
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v57
+; SI-NEXT:    v_or_b32_e32 v10, v10, v27
+; SI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v63
+; SI-NEXT:    v_or_b32_e32 v12, v12, v28
+; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v27, v27
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v28, v28
+; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v27
+; SI-NEXT:    v_cvt_f16_f32_e32 v36, v27
+; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v28
+; SI-NEXT:    v_cvt_f16_f32_e32 v28, v27
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v36
+; SI-NEXT:    v_or_b32_e32 v14, v14, v27
+; SI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v28
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v50
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v44
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 32, v0
-; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; SI-NEXT:    v_or_b32_e32 v18, v18, v27
+; SI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshr_b64 v[52:53], v[17:18], 16
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v28, v28
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v27, v27
+; SI-NEXT:    v_add_f32_e32 v28, 0x38000000, v28
+; SI-NEXT:    v_cvt_f16_f32_e32 v28, v28
+; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v27
+; SI-NEXT:    v_cvt_f16_f32_e32 v36, v27
+; SI-NEXT:    buffer_store_dword v28, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v51
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v49
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 36, v0
-; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; SI-NEXT:    v_or_b32_e32 v22, v22, v28
+; SI-NEXT:    v_cvt_f32_f16_e32 v28, v59
+; SI-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT:    v_lshr_b64 v[54:55], v[21:22], 16
+; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v28
+; SI-NEXT:    v_cvt_f32_f16_e32 v28, v30
+; SI-NEXT:    v_cvt_f16_f32_e32 v59, v27
+; SI-NEXT:    v_cvt_f32_f16_e32 v30, v51
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v36
+; SI-NEXT:    v_add_f32_e32 v28, 0x38000000, v28
+; SI-NEXT:    v_cvt_f16_f32_e32 v28, v28
+; SI-NEXT:    v_or_b32_e32 v26, v26, v27
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v59
+; SI-NEXT:    v_add_f32_e32 v30, 0x38000000, v30
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v48
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v43
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 40, v0
-; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
+; SI-NEXT:    v_cvt_f16_f32_e32 v36, v30
+; SI-NEXT:    v_or_b32_e32 v30, v28, v27
+; SI-NEXT:    v_cvt_f32_f16_e32 v27, v58
+; SI-NEXT:    v_lshr_b64 v[41:42], v[29:30], 16
+; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v36
+; SI-NEXT:    v_or_b32_e32 v16, v16, v28
+; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v27
+; SI-NEXT:    v_cvt_f16_f32_e32 v58, v27
+; SI-NEXT:    v_cvt_f32_f16_e32 v27, v61
+; SI-NEXT:    v_lshr_b64 v[43:44], v[15:16], 16
+; SI-NEXT:    v_mov_b32_e32 v44, v34
+; SI-NEXT:    v_mov_b32_e32 v42, v33
+; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v27
+; SI-NEXT:    v_cvt_f16_f32_e32 v61, v27
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v58
+; SI-NEXT:    v_or_b32_e32 v20, v20, v27
+; SI-NEXT:    v_lshr_b64 v[45:46], v[19:20], 16
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v61
+; SI-NEXT:    v_or_b32_e32 v24, v24, v27
+; SI-NEXT:    v_lshr_b64 v[33:34], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[47:48], v[23:24], 16
+; SI-NEXT:    v_mov_b32_e32 v23, v36
+; SI-NEXT:    v_mov_b32_e32 v46, v35
+; SI-NEXT:    v_lshr_b64 v[35:36], v[7:8], 16
+; SI-NEXT:    v_mov_b32_e32 v7, v63
+; SI-NEXT:    v_mov_b32_e32 v34, v56
+; SI-NEXT:    v_mov_b32_e32 v56, v62
+; SI-NEXT:    v_lshr_b64 v[62:63], v[3:4], 16
+; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT:    v_mov_b32_e32 v19, v39
+; SI-NEXT:    v_mov_b32_e32 v15, v38
+; SI-NEXT:    v_lshr_b64 v[39:40], v[25:26], 16
+; SI-NEXT:    v_lshr_b64 v[50:51], v[13:14], 16
+; SI-NEXT:    v_lshr_b64 v[48:49], v[11:12], 16
+; SI-NEXT:    v_mov_b32_e32 v11, v37
+; SI-NEXT:    v_lshr_b64 v[37:38], v[9:10], 16
+; SI-NEXT:    v_lshr_b64 v[27:28], v[1:2], 16
+; SI-NEXT:  .LBB59_3: ; %end
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v47
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v36
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v35
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 44, v0
-; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v24
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v61
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 4, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v38
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v42
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 48, v0
-; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v45
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 8, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v37
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v31
-; SI-NEXT:    v_or_b32_e32 v19, v19, v20
-; SI-NEXT:    v_add_i32_e32 v20, vcc, 52, v0
-; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v20
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v58
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 12, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v43
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v36
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v16
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v23
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 20, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v41
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v19
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 24, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v30
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v59
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 28, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v39
+; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 32, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v41
-; SI-NEXT:    v_or_b32_e32 v3, v3, v19
-; SI-NEXT:    v_add_i32_e32 v19, vcc, 56, v0
-; SI-NEXT:    buffer_store_dword v3, v19, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v26
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 36, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v34
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v32
-; SI-NEXT:    v_or_b32_e32 v3, v3, v19
-; SI-NEXT:    v_add_i32_e32 v19, vcc, 60, v0
-; SI-NEXT:    buffer_store_dword v3, v19, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v54
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 40, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v18
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v40
-; SI-NEXT:    v_or_b32_e32 v3, v3, v18
-; SI-NEXT:    v_add_i32_e32 v18, vcc, 64, v0
-; SI-NEXT:    buffer_store_dword v3, v18, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v22
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 44, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v15
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT:    v_or_b32_e32 v3, v3, v14
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 0x44, v0
-; SI-NEXT:    buffer_store_dword v3, v14, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v52
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 48, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v17
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v55
-; SI-NEXT:    v_or_b32_e32 v3, v3, v14
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 0x48, v0
-; SI-NEXT:    buffer_store_dword v3, v14, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v18
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 52, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v16
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; SI-NEXT:    v_or_b32_e32 v3, v3, v9
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 0x4c, v0
-; SI-NEXT:    buffer_store_dword v3, v9, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v54
-; SI-NEXT:    v_or_b32_e32 v3, v3, v9
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 0x50, v0
-; SI-NEXT:    buffer_store_dword v3, v9, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v13
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
-; SI-NEXT:    v_or_b32_e32 v3, v3, v9
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 0x54, v0
-; SI-NEXT:    buffer_store_dword v3, v9, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v10
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v53
-; SI-NEXT:    v_or_b32_e32 v3, v3, v9
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 0x58, v0
-; SI-NEXT:    buffer_store_dword v3, v9, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v15
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v50
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 56, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; SI-NEXT:    v_or_b32_e32 v3, v3, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x5c, v0
-; SI-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v14
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 60, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v30
-; SI-NEXT:    v_or_b32_e32 v3, v3, v5
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x60, v0
-; SI-NEXT:    buffer_store_dword v3, v5, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v48
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 64, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v7
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v12
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x44, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v63
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v37
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x48, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v10
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v57
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x4c, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v56
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v35
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x50, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v8
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v60
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x54, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v46
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v33
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x58, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v42
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x5c, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v44
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v62
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x60, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v32
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x64, v0
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v57
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v29
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v34
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v27
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x68, v0
 ; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v31
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x6c, v0
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
index 7785a9938e1be..967f1a9b442b0 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll
@@ -3847,361 +3847,396 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3
 ; SI-LABEL: bitcast_v30i32_to_v60i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_writelane_b32 v18, s30, 0
+; SI-NEXT:    v_writelane_b32 v18, s31, 1
+; SI-NEXT:    v_writelane_b32 v18, s34, 2
+; SI-NEXT:    v_writelane_b32 v18, s35, 3
+; SI-NEXT:    v_writelane_b32 v18, s36, 4
+; SI-NEXT:    v_writelane_b32 v18, s37, 5
+; SI-NEXT:    v_writelane_b32 v18, s38, 6
+; SI-NEXT:    v_writelane_b32 v18, s39, 7
+; SI-NEXT:    v_writelane_b32 v18, s48, 8
+; SI-NEXT:    v_writelane_b32 v18, s49, 9
+; SI-NEXT:    v_writelane_b32 v18, s50, 10
+; SI-NEXT:    v_writelane_b32 v18, s51, 11
+; SI-NEXT:    v_writelane_b32 v18, s52, 12
+; SI-NEXT:    v_writelane_b32 v18, s53, 13
+; SI-NEXT:    v_writelane_b32 v18, s54, 14
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v17
-; SI-NEXT:    v_readfirstlane_b32 s45, v1
-; SI-NEXT:    v_readfirstlane_b32 s44, v2
-; SI-NEXT:    v_readfirstlane_b32 s43, v3
-; SI-NEXT:    v_readfirstlane_b32 s42, v4
-; SI-NEXT:    v_readfirstlane_b32 s41, v5
-; SI-NEXT:    v_readfirstlane_b32 s40, v6
-; SI-NEXT:    v_readfirstlane_b32 s15, v7
-; SI-NEXT:    v_readfirstlane_b32 s14, v8
-; SI-NEXT:    v_readfirstlane_b32 s13, v9
-; SI-NEXT:    v_readfirstlane_b32 s12, v10
-; SI-NEXT:    v_readfirstlane_b32 s11, v11
-; SI-NEXT:    v_readfirstlane_b32 s10, v12
-; SI-NEXT:    v_readfirstlane_b32 s9, v13
-; SI-NEXT:    v_readfirstlane_b32 s8, v14
-; SI-NEXT:    v_readfirstlane_b32 s7, v15
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_readfirstlane_b32 s6, v16
+; SI-NEXT:    v_writelane_b32 v18, s55, 15
+; SI-NEXT:    v_readfirstlane_b32 s42, v1
+; SI-NEXT:    v_readfirstlane_b32 s43, v2
+; SI-NEXT:    v_readfirstlane_b32 s40, v3
+; SI-NEXT:    v_readfirstlane_b32 s41, v4
+; SI-NEXT:    v_readfirstlane_b32 s14, v5
+; SI-NEXT:    v_readfirstlane_b32 s15, v6
+; SI-NEXT:    v_readfirstlane_b32 s12, v7
+; SI-NEXT:    v_readfirstlane_b32 s13, v8
+; SI-NEXT:    v_readfirstlane_b32 s10, v9
+; SI-NEXT:    v_readfirstlane_b32 s11, v10
+; SI-NEXT:    v_readfirstlane_b32 s8, v11
+; SI-NEXT:    v_readfirstlane_b32 s9, v12
+; SI-NEXT:    v_readfirstlane_b32 s6, v13
+; SI-NEXT:    v_readfirstlane_b32 s7, v14
+; SI-NEXT:    v_readfirstlane_b32 s4, v15
+; SI-NEXT:    s_and_b64 s[44:45], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s5, v16
+; SI-NEXT:    v_writelane_b32 v18, s64, 16
 ; SI-NEXT:    s_cbranch_scc0 .LBB13_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s9
-; SI-NEXT:    v_mov_b32_e32 v3, s11
-; SI-NEXT:    v_mov_b32_e32 v4, s13
-; SI-NEXT:    v_mov_b32_e32 v5, s15
-; SI-NEXT:    v_mov_b32_e32 v6, s41
-; SI-NEXT:    v_mov_b32_e32 v7, s43
-; SI-NEXT:    v_mov_b32_e32 v8, s45
-; SI-NEXT:    v_mov_b32_e32 v9, s28
-; SI-NEXT:    v_mov_b32_e32 v10, s26
-; SI-NEXT:    v_mov_b32_e32 v11, s24
-; SI-NEXT:    v_mov_b32_e32 v12, s22
-; SI-NEXT:    v_mov_b32_e32 v13, s20
-; SI-NEXT:    v_mov_b32_e32 v14, s18
-; SI-NEXT:    v_mov_b32_e32 v15, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v2, s8, v2, 16
-; SI-NEXT:    v_alignbit_b32 v3, s10, v3, 16
-; SI-NEXT:    v_alignbit_b32 v4, s12, v4, 16
-; SI-NEXT:    v_alignbit_b32 v5, s14, v5, 16
-; SI-NEXT:    v_alignbit_b32 v6, s40, v6, 16
-; SI-NEXT:    v_alignbit_b32 v7, s42, v7, 16
-; SI-NEXT:    v_alignbit_b32 v8, s44, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, s29, v9, 16
-; SI-NEXT:    v_alignbit_b32 v10, s27, v10, 16
-; SI-NEXT:    v_alignbit_b32 v11, s25, v11, 16
-; SI-NEXT:    v_alignbit_b32 v12, s23, v12, 16
-; SI-NEXT:    v_alignbit_b32 v13, s21, v13, 16
-; SI-NEXT:    v_alignbit_b32 v14, s19, v14, 16
-; SI-NEXT:    v_alignbit_b32 v15, s17, v15, 16
-; SI-NEXT:    s_lshr_b32 s46, s6, 16
-; SI-NEXT:    s_lshr_b32 s47, s8, 16
-; SI-NEXT:    s_lshr_b32 s56, s10, 16
-; SI-NEXT:    s_lshr_b32 s57, s12, 16
-; SI-NEXT:    s_lshr_b32 s58, s14, 16
-; SI-NEXT:    s_lshr_b32 s59, s40, 16
-; SI-NEXT:    s_lshr_b32 s60, s42, 16
-; SI-NEXT:    s_lshr_b32 s61, s44, 16
-; SI-NEXT:    s_lshr_b32 s62, s29, 16
-; SI-NEXT:    s_lshr_b32 s63, s27, 16
-; SI-NEXT:    s_lshr_b32 s72, s25, 16
-; SI-NEXT:    s_lshr_b32 s73, s23, 16
-; SI-NEXT:    s_lshr_b32 s74, s21, 16
-; SI-NEXT:    s_lshr_b32 s75, s19, 16
-; SI-NEXT:    s_lshr_b32 s76, s17, 16
+; SI-NEXT:    s_lshr_b32 s34, s5, 16
+; SI-NEXT:    s_lshr_b32 s35, s7, 16
+; SI-NEXT:    s_lshr_b32 s36, s9, 16
+; SI-NEXT:    s_lshr_b32 s37, s11, 16
+; SI-NEXT:    s_lshr_b32 s38, s13, 16
+; SI-NEXT:    s_lshr_b32 s39, s15, 16
+; SI-NEXT:    s_lshr_b32 s48, s41, 16
+; SI-NEXT:    s_lshr_b32 s49, s43, 16
+; SI-NEXT:    s_lshr_b32 s50, s29, 16
+; SI-NEXT:    s_lshr_b32 s51, s27, 16
+; SI-NEXT:    s_lshr_b32 s52, s25, 16
+; SI-NEXT:    s_lshr_b32 s53, s23, 16
+; SI-NEXT:    s_lshr_b32 s54, s21, 16
+; SI-NEXT:    s_lshr_b32 s55, s19, 16
+; SI-NEXT:    s_lshr_b32 s64, s17, 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[58:59], s[10:11], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[12:13], 16
+; SI-NEXT:    s_lshr_b64 s[62:63], s[14:15], 16
+; SI-NEXT:    s_lshr_b64 s[72:73], s[40:41], 16
+; SI-NEXT:    s_lshr_b64 s[74:75], s[42:43], 16
+; SI-NEXT:    s_lshr_b64 s[76:77], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[78:79], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[88:89], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[90:91], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[92:93], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[94:95], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[30:31], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB13_3
 ; SI-NEXT:  .LBB13_2: ; %cmp.true
+; SI-NEXT:    s_add_i32 s17, s17, 3
 ; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    s_add_i32 s19, s19, 3
 ; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_add_i32 s21, s21, 3
 ; SI-NEXT:    s_add_i32 s20, s20, 3
+; SI-NEXT:    s_add_i32 s23, s23, 3
 ; SI-NEXT:    s_add_i32 s22, s22, 3
+; SI-NEXT:    s_add_i32 s25, s25, 3
 ; SI-NEXT:    s_add_i32 s24, s24, 3
+; SI-NEXT:    s_add_i32 s27, s27, 3
 ; SI-NEXT:    s_add_i32 s26, s26, 3
+; SI-NEXT:    s_add_i32 s29, s29, 3
 ; SI-NEXT:    s_add_i32 s28, s28, 3
-; SI-NEXT:    s_add_i32 s45, s45, 3
 ; SI-NEXT:    s_add_i32 s43, s43, 3
-; SI-NEXT:    s_add_i32 s41, s41, 3
-; SI-NEXT:    s_add_i32 s15, s15, 3
-; SI-NEXT:    s_add_i32 s13, s13, 3
-; SI-NEXT:    s_add_i32 s11, s11, 3
-; SI-NEXT:    s_add_i32 s9, s9, 3
-; SI-NEXT:    s_add_i32 s7, s7, 3
-; SI-NEXT:    s_add_i32 s17, s17, 3
-; SI-NEXT:    s_add_i32 s19, s19, 3
-; SI-NEXT:    s_add_i32 s21, s21, 3
-; SI-NEXT:    s_add_i32 s23, s23, 3
-; SI-NEXT:    s_add_i32 s25, s25, 3
-; SI-NEXT:    s_add_i32 s27, s27, 3
-; SI-NEXT:    s_add_i32 s29, s29, 3
-; SI-NEXT:    s_add_i32 s44, s44, 3
 ; SI-NEXT:    s_add_i32 s42, s42, 3
+; SI-NEXT:    s_add_i32 s41, s41, 3
 ; SI-NEXT:    s_add_i32 s40, s40, 3
+; SI-NEXT:    s_add_i32 s15, s15, 3
 ; SI-NEXT:    s_add_i32 s14, s14, 3
+; SI-NEXT:    s_add_i32 s13, s13, 3
 ; SI-NEXT:    s_add_i32 s12, s12, 3
+; SI-NEXT:    s_add_i32 s11, s11, 3
 ; SI-NEXT:    s_add_i32 s10, s10, 3
+; SI-NEXT:    s_add_i32 s9, s9, 3
 ; SI-NEXT:    s_add_i32 s8, s8, 3
+; SI-NEXT:    s_add_i32 s7, s7, 3
 ; SI-NEXT:    s_add_i32 s6, s6, 3
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s9
-; SI-NEXT:    v_mov_b32_e32 v3, s11
-; SI-NEXT:    v_mov_b32_e32 v4, s13
-; SI-NEXT:    v_mov_b32_e32 v5, s15
-; SI-NEXT:    v_mov_b32_e32 v6, s41
-; SI-NEXT:    v_mov_b32_e32 v7, s43
-; SI-NEXT:    v_mov_b32_e32 v8, s45
-; SI-NEXT:    v_mov_b32_e32 v9, s28
-; SI-NEXT:    v_mov_b32_e32 v10, s26
-; SI-NEXT:    v_mov_b32_e32 v11, s24
-; SI-NEXT:    v_mov_b32_e32 v12, s22
-; SI-NEXT:    v_mov_b32_e32 v13, s20
-; SI-NEXT:    v_mov_b32_e32 v14, s18
-; SI-NEXT:    v_mov_b32_e32 v15, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v2, s8, v2, 16
-; SI-NEXT:    v_alignbit_b32 v3, s10, v3, 16
-; SI-NEXT:    v_alignbit_b32 v4, s12, v4, 16
-; SI-NEXT:    v_alignbit_b32 v5, s14, v5, 16
-; SI-NEXT:    v_alignbit_b32 v6, s40, v6, 16
-; SI-NEXT:    v_alignbit_b32 v7, s42, v7, 16
-; SI-NEXT:    v_alignbit_b32 v8, s44, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, s29, v9, 16
-; SI-NEXT:    v_alignbit_b32 v10, s27, v10, 16
-; SI-NEXT:    v_alignbit_b32 v11, s25, v11, 16
-; SI-NEXT:    v_alignbit_b32 v12, s23, v12, 16
-; SI-NEXT:    v_alignbit_b32 v13, s21, v13, 16
-; SI-NEXT:    v_alignbit_b32 v14, s19, v14, 16
-; SI-NEXT:    v_alignbit_b32 v15, s17, v15, 16
-; SI-NEXT:    s_lshr_b32 s46, s6, 16
-; SI-NEXT:    s_lshr_b32 s47, s8, 16
-; SI-NEXT:    s_lshr_b32 s56, s10, 16
-; SI-NEXT:    s_lshr_b32 s57, s12, 16
-; SI-NEXT:    s_lshr_b32 s58, s14, 16
-; SI-NEXT:    s_lshr_b32 s59, s40, 16
-; SI-NEXT:    s_lshr_b32 s60, s42, 16
-; SI-NEXT:    s_lshr_b32 s61, s44, 16
-; SI-NEXT:    s_lshr_b32 s62, s29, 16
-; SI-NEXT:    s_lshr_b32 s63, s27, 16
-; SI-NEXT:    s_lshr_b32 s72, s25, 16
-; SI-NEXT:    s_lshr_b32 s73, s23, 16
-; SI-NEXT:    s_lshr_b32 s74, s21, 16
-; SI-NEXT:    s_lshr_b32 s75, s19, 16
-; SI-NEXT:    s_lshr_b32 s76, s17, 16
+; SI-NEXT:    s_add_i32 s5, s5, 3
+; SI-NEXT:    s_add_i32 s4, s4, 3
+; SI-NEXT:    s_lshr_b64 s[44:45], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[58:59], s[10:11], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[12:13], 16
+; SI-NEXT:    s_lshr_b64 s[62:63], s[14:15], 16
+; SI-NEXT:    s_lshr_b64 s[72:73], s[40:41], 16
+; SI-NEXT:    s_lshr_b64 s[74:75], s[42:43], 16
+; SI-NEXT:    s_lshr_b32 s34, s5, 16
+; SI-NEXT:    s_lshr_b32 s35, s7, 16
+; SI-NEXT:    s_lshr_b32 s36, s9, 16
+; SI-NEXT:    s_lshr_b32 s37, s11, 16
+; SI-NEXT:    s_lshr_b32 s38, s13, 16
+; SI-NEXT:    s_lshr_b32 s39, s15, 16
+; SI-NEXT:    s_lshr_b32 s48, s41, 16
+; SI-NEXT:    s_lshr_b32 s49, s43, 16
+; SI-NEXT:    s_lshr_b32 s50, s29, 16
+; SI-NEXT:    s_lshr_b32 s51, s27, 16
+; SI-NEXT:    s_lshr_b32 s52, s25, 16
+; SI-NEXT:    s_lshr_b32 s53, s23, 16
+; SI-NEXT:    s_lshr_b32 s54, s21, 16
+; SI-NEXT:    s_lshr_b32 s55, s19, 16
+; SI-NEXT:    s_lshr_b32 s64, s17, 16
+; SI-NEXT:    s_lshr_b64 s[76:77], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[78:79], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[88:89], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[90:91], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[92:93], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[94:95], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[30:31], s[16:17], 16
 ; SI-NEXT:  .LBB13_3: ; %end
-; SI-NEXT:    s_and_b32 s4, s16, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; SI-NEXT:    v_or_b32_e32 v15, s4, v15
-; SI-NEXT:    s_and_b32 s4, s17, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s76, 16
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_mov_b32_e32 v16, s4
-; SI-NEXT:    s_and_b32 s4, s18, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v15, vcc, 4, v0
-; SI-NEXT:    v_or_b32_e32 v14, s4, v14
-; SI-NEXT:    s_and_b32 s4, s19, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s75, 16
-; SI-NEXT:    buffer_store_dword v16, v15, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v15, vcc, 8, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v14, v15, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v15, s4
-; SI-NEXT:    s_and_b32 s4, s20, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; SI-NEXT:    s_lshl_b32 s45, s30, 16
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s45
+; SI-NEXT:    v_mov_b32_e32 v1, s16
+; SI-NEXT:    s_and_b32 s16, s17, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s64, 16
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_lshl_b32 s16, s94, 16
+; SI-NEXT:    s_and_b32 s17, s18, 0xffff
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    v_mov_b32_e32 v3, s16
+; SI-NEXT:    s_and_b32 s16, s19, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s55, 16
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v4, s16
+; SI-NEXT:    s_lshl_b32 s16, s92, 16
+; SI-NEXT:    s_and_b32 s17, s20, 0xffff
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    v_mov_b32_e32 v5, s16
+; SI-NEXT:    s_and_b32 s16, s21, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s54, 16
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 12, v0
-; SI-NEXT:    v_or_b32_e32 v13, s4, v13
-; SI-NEXT:    s_and_b32 s4, s21, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s74, 16
-; SI-NEXT:    buffer_store_dword v15, v14, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 16, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v14, s4
-; SI-NEXT:    s_and_b32 s4, s22, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
+; SI-NEXT:    v_mov_b32_e32 v6, s16
+; SI-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 12, v0
+; SI-NEXT:    s_lshl_b32 s16, s90, 16
+; SI-NEXT:    s_and_b32 s17, s22, 0xffff
+; SI-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 16, v0
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 20, v0
+; SI-NEXT:    s_waitcnt expcnt(3)
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s23, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s53, 16
+; SI-NEXT:    buffer_store_dword v6, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 24, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v13, vcc, 20, v0
-; SI-NEXT:    v_or_b32_e32 v12, s4, v12
-; SI-NEXT:    s_and_b32 s4, s23, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s73, 16
-; SI-NEXT:    buffer_store_dword v14, v13, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v13, vcc, 24, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v12, v13, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v13, s4
-; SI-NEXT:    s_and_b32 s4, s24, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s24, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s88, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 28, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v12, vcc, 28, v0
-; SI-NEXT:    v_or_b32_e32 v11, s4, v11
-; SI-NEXT:    s_and_b32 s4, s25, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s72, 16
-; SI-NEXT:    buffer_store_dword v13, v12, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v12, vcc, 32, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v11, v12, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v12, s4
-; SI-NEXT:    s_and_b32 s4, s26, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s25, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s52, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 32, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 36, v0
-; SI-NEXT:    v_or_b32_e32 v10, s4, v10
-; SI-NEXT:    s_and_b32 s4, s27, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s63, 16
-; SI-NEXT:    buffer_store_dword v12, v11, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 40, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v10, v11, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v11, s4
-; SI-NEXT:    s_and_b32 s4, s28, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s26, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s78, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 36, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 44, v0
-; SI-NEXT:    v_or_b32_e32 v9, s4, v9
-; SI-NEXT:    s_and_b32 s4, s29, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s62, 16
-; SI-NEXT:    buffer_store_dword v11, v10, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v9, v10, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v10, s4
-; SI-NEXT:    s_and_b32 s4, s45, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s27, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s51, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 40, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 52, v0
-; SI-NEXT:    v_or_b32_e32 v8, s4, v8
-; SI-NEXT:    s_and_b32 s4, s44, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s61, 16
-; SI-NEXT:    buffer_store_dword v10, v9, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 56, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v8, v9, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v9, s4
-; SI-NEXT:    s_and_b32 s4, s43, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s28, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s76, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 44, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 60, v0
-; SI-NEXT:    v_or_b32_e32 v7, s4, v7
-; SI-NEXT:    s_and_b32 s4, s42, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s60, 16
-; SI-NEXT:    buffer_store_dword v9, v8, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 64, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v8, s4
-; SI-NEXT:    s_and_b32 s4, s41, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s29, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s50, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 48, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x44, v0
-; SI-NEXT:    v_or_b32_e32 v6, s4, v6
-; SI-NEXT:    s_and_b32 s4, s40, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s59, 16
-; SI-NEXT:    buffer_store_dword v8, v7, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v7, s4
-; SI-NEXT:    s_and_b32 s4, s15, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s42, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s74, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 52, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x4c, v0
-; SI-NEXT:    v_or_b32_e32 v5, s4, v5
-; SI-NEXT:    s_and_b32 s4, s14, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s58, 16
-; SI-NEXT:    buffer_store_dword v7, v6, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x50, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v6, s4
-; SI-NEXT:    s_and_b32 s4, s13, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s43, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s49, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 56, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x54, v0
-; SI-NEXT:    v_or_b32_e32 v4, s4, v4
-; SI-NEXT:    s_and_b32 s4, s12, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s57, 16
-; SI-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x58, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v5, s4
-; SI-NEXT:    s_and_b32 s4, s11, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s40, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s72, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 60, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x5c, v0
-; SI-NEXT:    v_or_b32_e32 v3, s4, v3
-; SI-NEXT:    s_and_b32 s4, s10, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s56, 16
-; SI-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x60, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v4, s4
-; SI-NEXT:    s_and_b32 s4, s9, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s41, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s48, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 64, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x64, v0
-; SI-NEXT:    v_or_b32_e32 v2, s4, v2
-; SI-NEXT:    s_and_b32 s4, s8, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s47, 16
-; SI-NEXT:    buffer_store_dword v4, v3, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x68, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v3, s4
-; SI-NEXT:    s_and_b32 s4, s7, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s14, s14, 0xffff
+; SI-NEXT:    s_lshl_b32 s16, s62, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x44, v0
+; SI-NEXT:    s_or_b32 s14, s14, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x6c, v0
-; SI-NEXT:    v_or_b32_e32 v1, s4, v1
-; SI-NEXT:    s_and_b32 s4, s6, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s46, 16
-; SI-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x70, v0
+; SI-NEXT:    v_mov_b32_e32 v2, s14
+; SI-NEXT:    s_and_b32 s14, s15, 0xffff
+; SI-NEXT:    s_lshl_b32 s15, s39, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x48, v0
+; SI-NEXT:    s_or_b32 s14, s14, s15
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s14
+; SI-NEXT:    s_and_b32 s12, s12, 0xffff
+; SI-NEXT:    s_lshl_b32 s14, s60, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x4c, v0
+; SI-NEXT:    s_or_b32 s12, s12, s14
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s12
+; SI-NEXT:    s_and_b32 s12, s13, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s38, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x50, v0
+; SI-NEXT:    s_or_b32 s12, s12, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s12
+; SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; SI-NEXT:    s_lshl_b32 s12, s58, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x54, v0
+; SI-NEXT:    s_or_b32 s10, s10, s12
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s10, s11, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s37, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x58, v0
+; SI-NEXT:    s_or_b32 s10, s10, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; SI-NEXT:    s_lshl_b32 s10, s56, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x5c, v0
+; SI-NEXT:    s_or_b32 s8, s8, s10
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s8, s9, 0xffff
+; SI-NEXT:    s_lshl_b32 s9, s36, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x60, v0
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_lshl_b32 s8, s46, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x64, v0
+; SI-NEXT:    s_or_b32 s6, s6, s8
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s6, s7, 0xffff
+; SI-NEXT:    s_lshl_b32 s7, s35, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x68, v0
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; SI-NEXT:    s_lshl_b32 s6, s44, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x6c, v0
+; SI-NEXT:    s_or_b32 s4, s4, s6
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s4
+; SI-NEXT:    s_and_b32 s4, s5, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s34, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x70, v0
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x74, v0
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT:    v_readlane_b32 s64, v18, 16
+; SI-NEXT:    v_readlane_b32 s55, v18, 15
+; SI-NEXT:    v_readlane_b32 s54, v18, 14
+; SI-NEXT:    v_readlane_b32 s53, v18, 13
+; SI-NEXT:    v_readlane_b32 s52, v18, 12
+; SI-NEXT:    v_readlane_b32 s51, v18, 11
+; SI-NEXT:    v_readlane_b32 s50, v18, 10
+; SI-NEXT:    v_readlane_b32 s49, v18, 9
+; SI-NEXT:    v_readlane_b32 s48, v18, 8
+; SI-NEXT:    v_readlane_b32 s39, v18, 7
+; SI-NEXT:    v_readlane_b32 s38, v18, 6
+; SI-NEXT:    v_readlane_b32 s37, v18, 5
+; SI-NEXT:    v_readlane_b32 s36, v18, 4
+; SI-NEXT:    v_readlane_b32 s35, v18, 3
+; SI-NEXT:    v_readlane_b32 s34, v18, 2
+; SI-NEXT:    v_readlane_b32 s31, v18, 1
+; SI-NEXT:    v_readlane_b32 s30, v18, 0
+; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB13_4:
-; SI-NEXT:    ; implicit-def: $vgpr15
+; SI-NEXT:    ; implicit-def: $sgpr30
+; SI-NEXT:    ; implicit-def: $sgpr64
+; SI-NEXT:    ; implicit-def: $sgpr94
+; SI-NEXT:    ; implicit-def: $sgpr55
+; SI-NEXT:    ; implicit-def: $sgpr92
+; SI-NEXT:    ; implicit-def: $sgpr54
+; SI-NEXT:    ; implicit-def: $sgpr90
+; SI-NEXT:    ; implicit-def: $sgpr53
+; SI-NEXT:    ; implicit-def: $sgpr88
+; SI-NEXT:    ; implicit-def: $sgpr52
+; SI-NEXT:    ; implicit-def: $sgpr78
+; SI-NEXT:    ; implicit-def: $sgpr51
 ; SI-NEXT:    ; implicit-def: $sgpr76
-; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; implicit-def: $sgpr75
-; SI-NEXT:    ; implicit-def: $vgpr13
+; SI-NEXT:    ; implicit-def: $sgpr50
 ; SI-NEXT:    ; implicit-def: $sgpr74
-; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; implicit-def: $sgpr73
-; SI-NEXT:    ; implicit-def: $vgpr11
+; SI-NEXT:    ; implicit-def: $sgpr49
 ; SI-NEXT:    ; implicit-def: $sgpr72
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $sgpr63
-; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $sgpr48
 ; SI-NEXT:    ; implicit-def: $sgpr62
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $sgpr61
-; SI-NEXT:    ; implicit-def: $vgpr7
+; SI-NEXT:    ; implicit-def: $sgpr39
 ; SI-NEXT:    ; implicit-def: $sgpr60
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $sgpr59
-; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $sgpr38
 ; SI-NEXT:    ; implicit-def: $sgpr58
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $sgpr57
-; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $sgpr37
 ; SI-NEXT:    ; implicit-def: $sgpr56
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $sgpr47
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $sgpr36
 ; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr35
+; SI-NEXT:    ; implicit-def: $sgpr34
+; SI-NEXT:    ; implicit-def: $sgpr44
 ; SI-NEXT:    s_branch .LBB13_2
 ;
 ; VI-LABEL: bitcast_v30i32_to_v60i16_scalar:
@@ -15902,245 +15937,248 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v17
-; SI-NEXT:    v_mov_b32_e32 v30, s16
+; SI-NEXT:    v_mov_b32_e32 v27, s16
 ; SI-NEXT:    v_mov_b32_e32 v28, s17
-; SI-NEXT:    v_mov_b32_e32 v33, s18
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_mov_b32_e32 v32, s19
-; SI-NEXT:    v_mov_b32_e32 v29, s20
-; SI-NEXT:    v_mov_b32_e32 v27, s21
-; SI-NEXT:    v_mov_b32_e32 v25, s22
+; SI-NEXT:    v_mov_b32_e32 v29, s18
+; SI-NEXT:    v_mov_b32_e32 v30, s19
+; SI-NEXT:    v_mov_b32_e32 v25, s20
+; SI-NEXT:    v_mov_b32_e32 v26, s21
+; SI-NEXT:    v_mov_b32_e32 v23, s22
 ; SI-NEXT:    v_mov_b32_e32 v24, s23
-; SI-NEXT:    v_mov_b32_e32 v23, s24
-; SI-NEXT:    v_mov_b32_e32 v21, s25
-; SI-NEXT:    v_mov_b32_e32 v20, s26
-; SI-NEXT:    v_mov_b32_e32 v19, s27
-; SI-NEXT:    v_mov_b32_e32 v18, s28
-; SI-NEXT:    v_mov_b32_e32 v17, s29
-; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v21, s24
+; SI-NEXT:    v_mov_b32_e32 v22, s25
+; SI-NEXT:    v_mov_b32_e32 v19, s26
+; SI-NEXT:    v_mov_b32_e32 v20, s27
+; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
+; SI-NEXT:    v_mov_b32_e32 v17, s28
+; SI-NEXT:    v_mov_b32_e32 v18, s29
+; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB29_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_alignbit_b32 v22, v16, v15, 16
-; SI-NEXT:    v_alignbit_b32 v26, v14, v13, 16
-; SI-NEXT:    v_alignbit_b32 v31, v12, v11, 16
-; SI-NEXT:    v_alignbit_b32 v34, v10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v35, v8, v7, 16
-; SI-NEXT:    v_alignbit_b32 v36, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v38, v4, v3, 16
-; SI-NEXT:    v_alignbit_b32 v48, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v51, v17, v18, 16
-; SI-NEXT:    v_alignbit_b32 v53, v19, v20, 16
-; SI-NEXT:    v_alignbit_b32 v55, v21, v23, 16
-; SI-NEXT:    v_alignbit_b32 v41, v24, v25, 16
-; SI-NEXT:    v_alignbit_b32 v44, v27, v29, 16
+; SI-NEXT:    v_lshr_b64 v[31:32], v[15:16], 16
+; SI-NEXT:    v_lshr_b64 v[32:33], v[13:14], 16
+; SI-NEXT:    v_lshr_b64 v[48:49], v[17:18], 16
+; SI-NEXT:    v_lshr_b64 v[33:34], v[11:12], 16
+; SI-NEXT:    v_lshr_b64 v[49:50], v[19:20], 16
+; SI-NEXT:    v_lshr_b64 v[34:35], v[9:10], 16
+; SI-NEXT:    v_lshr_b64 v[50:51], v[21:22], 16
+; SI-NEXT:    v_lshr_b64 v[35:36], v[7:8], 16
+; SI-NEXT:    v_lshr_b64 v[51:52], v[23:24], 16
+; SI-NEXT:    v_lshr_b64 v[36:37], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[52:53], v[25:26], 16
+; SI-NEXT:    v_lshr_b64 v[37:38], v[3:4], 16
+; SI-NEXT:    v_lshr_b64 v[53:54], v[29:30], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v2
 ; SI-NEXT:    s_waitcnt expcnt(6)
-; SI-NEXT:    v_alignbit_b32 v46, v32, v33, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v18
+; SI-NEXT:    s_waitcnt expcnt(5)
+; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v20
 ; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    v_alignbit_b32 v56, v28, v30, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v19
-; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v21
+; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v22
 ; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v24
+; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v24
 ; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v27
+; SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v26
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v32
+; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v30
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v28
+; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v28
+; SI-NEXT:    v_lshr_b64 v[38:39], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[54:55], v[27:28], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB29_3
 ; SI-NEXT:  .LBB29_2: ; %cmp.true
-; SI-NEXT:    v_add_f32_e32 v28, 1.0, v28
-; SI-NEXT:    v_add_f32_e32 v30, 1.0, v30
-; SI-NEXT:    v_add_f32_e32 v32, 1.0, v32
-; SI-NEXT:    v_add_f32_e32 v33, 1.0, v33
-; SI-NEXT:    v_add_f32_e32 v27, 1.0, v27
-; SI-NEXT:    v_add_f32_e32 v29, 1.0, v29
-; SI-NEXT:    v_add_f32_e32 v24, 1.0, v24
-; SI-NEXT:    v_add_f32_e32 v25, 1.0, v25
+; SI-NEXT:    v_add_f32_e32 v16, 1.0, v16
+; SI-NEXT:    v_add_f32_e32 v15, 1.0, v15
+; SI-NEXT:    v_add_f32_e32 v18, 1.0, v18
+; SI-NEXT:    v_add_f32_e32 v17, 1.0, v17
+; SI-NEXT:    v_add_f32_e32 v14, 1.0, v14
+; SI-NEXT:    v_add_f32_e32 v13, 1.0, v13
+; SI-NEXT:    v_lshr_b64 v[31:32], v[15:16], 16
+; SI-NEXT:    v_add_f32_e32 v20, 1.0, v20
+; SI-NEXT:    v_add_f32_e32 v19, 1.0, v19
+; SI-NEXT:    v_add_f32_e32 v12, 1.0, v12
+; SI-NEXT:    v_add_f32_e32 v11, 1.0, v11
+; SI-NEXT:    v_lshr_b64 v[32:33], v[13:14], 16
+; SI-NEXT:    v_lshr_b64 v[48:49], v[17:18], 16
+; SI-NEXT:    v_add_f32_e32 v22, 1.0, v22
 ; SI-NEXT:    v_add_f32_e32 v21, 1.0, v21
+; SI-NEXT:    v_add_f32_e32 v10, 1.0, v10
+; SI-NEXT:    v_add_f32_e32 v9, 1.0, v9
+; SI-NEXT:    v_lshr_b64 v[33:34], v[11:12], 16
+; SI-NEXT:    v_lshr_b64 v[49:50], v[19:20], 16
+; SI-NEXT:    v_add_f32_e32 v24, 1.0, v24
 ; SI-NEXT:    v_add_f32_e32 v23, 1.0, v23
-; SI-NEXT:    v_add_f32_e32 v19, 1.0, v19
-; SI-NEXT:    v_add_f32_e32 v20, 1.0, v20
-; SI-NEXT:    v_add_f32_e32 v17, 1.0, v17
-; SI-NEXT:    v_add_f32_e32 v18, 1.0, v18
-; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_add_f32_e32 v1, 1.0, v1
-; SI-NEXT:    v_add_f32_e32 v4, 1.0, v4
-; SI-NEXT:    v_add_f32_e32 v3, 1.0, v3
-; SI-NEXT:    v_add_f32_e32 v6, 1.0, v6
-; SI-NEXT:    v_add_f32_e32 v5, 1.0, v5
 ; SI-NEXT:    v_add_f32_e32 v8, 1.0, v8
 ; SI-NEXT:    v_add_f32_e32 v7, 1.0, v7
-; SI-NEXT:    v_add_f32_e32 v10, 1.0, v10
-; SI-NEXT:    v_add_f32_e32 v9, 1.0, v9
-; SI-NEXT:    v_add_f32_e32 v12, 1.0, v12
-; SI-NEXT:    v_add_f32_e32 v11, 1.0, v11
-; SI-NEXT:    v_add_f32_e32 v14, 1.0, v14
-; SI-NEXT:    v_add_f32_e32 v13, 1.0, v13
-; SI-NEXT:    v_add_f32_e32 v16, 1.0, v16
-; SI-NEXT:    v_add_f32_e32 v15, 1.0, v15
-; SI-NEXT:    v_alignbit_b32 v22, v16, v15, 16
-; SI-NEXT:    v_alignbit_b32 v26, v14, v13, 16
-; SI-NEXT:    v_alignbit_b32 v31, v12, v11, 16
-; SI-NEXT:    v_alignbit_b32 v34, v10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v35, v8, v7, 16
-; SI-NEXT:    v_alignbit_b32 v36, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v38, v4, v3, 16
-; SI-NEXT:    v_alignbit_b32 v48, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v51, v17, v18, 16
-; SI-NEXT:    v_alignbit_b32 v53, v19, v20, 16
-; SI-NEXT:    v_alignbit_b32 v55, v21, v23, 16
-; SI-NEXT:    v_alignbit_b32 v41, v24, v25, 16
-; SI-NEXT:    v_alignbit_b32 v44, v27, v29, 16
+; SI-NEXT:    v_lshr_b64 v[34:35], v[9:10], 16
+; SI-NEXT:    v_lshr_b64 v[50:51], v[21:22], 16
+; SI-NEXT:    v_add_f32_e32 v26, 1.0, v26
+; SI-NEXT:    v_add_f32_e32 v25, 1.0, v25
+; SI-NEXT:    v_add_f32_e32 v6, 1.0, v6
+; SI-NEXT:    v_add_f32_e32 v5, 1.0, v5
+; SI-NEXT:    v_lshr_b64 v[35:36], v[7:8], 16
+; SI-NEXT:    v_lshr_b64 v[51:52], v[23:24], 16
+; SI-NEXT:    v_add_f32_e32 v30, 1.0, v30
+; SI-NEXT:    v_add_f32_e32 v29, 1.0, v29
+; SI-NEXT:    v_add_f32_e32 v4, 1.0, v4
+; SI-NEXT:    v_add_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_lshr_b64 v[36:37], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[52:53], v[25:26], 16
+; SI-NEXT:    v_add_f32_e32 v28, 1.0, v28
+; SI-NEXT:    v_add_f32_e32 v27, 1.0, v27
+; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_lshr_b64 v[37:38], v[3:4], 16
+; SI-NEXT:    v_lshr_b64 v[53:54], v[29:30], 16
+; SI-NEXT:    v_lshr_b64 v[38:39], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[54:55], v[27:28], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v2
 ; SI-NEXT:    s_waitcnt expcnt(6)
-; SI-NEXT:    v_alignbit_b32 v46, v32, v33, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v18
+; SI-NEXT:    s_waitcnt expcnt(5)
+; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v20
 ; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    v_alignbit_b32 v56, v28, v30, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v37, 16, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v50, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v17
-; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v19
-; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v21
+; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v22
 ; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v24
+; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v24
 ; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v27
+; SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v26
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v32
+; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v30
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v28
+; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v28
 ; SI-NEXT:  .LBB29_3: ; %end
-; SI-NEXT:    v_and_b32_e32 v30, 0xffff, v30
-; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v56, 16, v56
-; SI-NEXT:    v_or_b32_e32 v30, v30, v56
-; SI-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v28, 0xffff, v28
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v60
-; SI-NEXT:    v_or_b32_e32 v28, v28, v30
-; SI-NEXT:    v_add_i32_e32 v30, vcc, 4, v0
-; SI-NEXT:    buffer_store_dword v28, v30, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v28, 0xffff, v33
-; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v46
-; SI-NEXT:    v_or_b32_e32 v28, v28, v30
-; SI-NEXT:    v_add_i32_e32 v30, vcc, 8, v0
-; SI-NEXT:    buffer_store_dword v28, v30, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v39, 16, v54
+; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v27
+; SI-NEXT:    v_or_b32_e32 v27, v27, v39
+; SI-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v28, 0xffff, v32
-; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v59
-; SI-NEXT:    v_or_b32_e32 v28, v28, v30
-; SI-NEXT:    v_add_i32_e32 v30, vcc, 12, v0
-; SI-NEXT:    buffer_store_dword v28, v30, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v28
+; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v62
+; SI-NEXT:    v_or_b32_e32 v27, v27, v28
+; SI-NEXT:    v_add_i32_e32 v28, vcc, 4, v0
+; SI-NEXT:    buffer_store_dword v27, v28, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v53
 ; SI-NEXT:    v_and_b32_e32 v28, 0xffff, v29
-; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v44
-; SI-NEXT:    v_or_b32_e32 v28, v28, v29
-; SI-NEXT:    v_add_i32_e32 v29, vcc, 16, v0
-; SI-NEXT:    buffer_store_dword v28, v29, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v27
+; SI-NEXT:    v_or_b32_e32 v27, v28, v27
+; SI-NEXT:    v_add_i32_e32 v28, vcc, 8, v0
+; SI-NEXT:    buffer_store_dword v27, v28, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v58
+; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v30
+; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v61
 ; SI-NEXT:    v_or_b32_e32 v27, v27, v28
-; SI-NEXT:    v_add_i32_e32 v28, vcc, 20, v0
+; SI-NEXT:    v_add_i32_e32 v28, vcc, 12, v0
 ; SI-NEXT:    buffer_store_dword v27, v28, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v25, 0xffff, v25
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v41
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v52
+; SI-NEXT:    v_and_b32_e32 v25, 0xffff, v25
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v27
-; SI-NEXT:    v_add_i32_e32 v27, vcc, 24, v0
+; SI-NEXT:    v_add_i32_e32 v27, vcc, 16, v0
 ; SI-NEXT:    buffer_store_dword v25, v27, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v24, 0xffff, v24
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v57
-; SI-NEXT:    v_or_b32_e32 v24, v24, v25
-; SI-NEXT:    v_add_i32_e32 v25, vcc, 28, v0
-; SI-NEXT:    buffer_store_dword v24, v25, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v25, 0xffff, v26
+; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v60
+; SI-NEXT:    v_or_b32_e32 v25, v25, v26
+; SI-NEXT:    v_add_i32_e32 v26, vcc, 20, v0
+; SI-NEXT:    buffer_store_dword v25, v26, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v51
 ; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v23
+; SI-NEXT:    v_or_b32_e32 v23, v23, v25
+; SI-NEXT:    v_add_i32_e32 v25, vcc, 24, v0
+; SI-NEXT:    buffer_store_dword v23, v25, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v55
+; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v24
+; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v59
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v24
-; SI-NEXT:    v_add_i32_e32 v24, vcc, 32, v0
+; SI-NEXT:    v_add_i32_e32 v24, vcc, 28, v0
 ; SI-NEXT:    buffer_store_dword v23, v24, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v21
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v47
+; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v50
 ; SI-NEXT:    v_or_b32_e32 v21, v21, v23
-; SI-NEXT:    v_add_i32_e32 v23, vcc, 36, v0
+; SI-NEXT:    v_add_i32_e32 v23, vcc, 32, v0
 ; SI-NEXT:    buffer_store_dword v21, v23, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v20, 0xffff, v20
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v53
-; SI-NEXT:    v_or_b32_e32 v20, v20, v21
-; SI-NEXT:    v_add_i32_e32 v21, vcc, 40, v0
-; SI-NEXT:    buffer_store_dword v20, v21, s[0:3], 0 offen
+; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v22
+; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v58
+; SI-NEXT:    v_or_b32_e32 v21, v21, v22
+; SI-NEXT:    v_add_i32_e32 v22, vcc, 36, v0
+; SI-NEXT:    buffer_store_dword v21, v22, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v45
+; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v49
+; SI-NEXT:    v_or_b32_e32 v19, v19, v21
+; SI-NEXT:    v_add_i32_e32 v21, vcc, 40, v0
+; SI-NEXT:    buffer_store_dword v19, v21, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v20
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v57
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v20
 ; SI-NEXT:    v_add_i32_e32 v20, vcc, 44, v0
 ; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v18, 0xffff, v18
+; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v51
-; SI-NEXT:    v_or_b32_e32 v18, v18, v19
+; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v48
+; SI-NEXT:    v_or_b32_e32 v17, v17, v19
 ; SI-NEXT:    v_add_i32_e32 v19, vcc, 48, v0
-; SI-NEXT:    buffer_store_dword v18, v19, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; SI-NEXT:    buffer_store_dword v17, v19, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v43
+; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v18
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v56
 ; SI-NEXT:    v_or_b32_e32 v17, v17, v18
 ; SI-NEXT:    v_add_i32_e32 v18, vcc, 52, v0
 ; SI-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v48
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v38
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v17
 ; SI-NEXT:    v_add_i32_e32 v17, vcc, 56, v0
 ; SI-NEXT:    buffer_store_dword v1, v17, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v42
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v47
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 60, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v38
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v37
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 64, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v40
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v46
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x44, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -16152,7 +16190,7 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v54
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v45
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x4c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -16164,7 +16202,7 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v52
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v44
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x54, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -16176,92 +16214,94 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v10
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v50
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v43
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x5c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v31
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v33
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x60, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v49
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v42
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x64, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v13
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v26
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v32
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x68, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v14
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v39
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v41
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x6c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v15
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v22
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v31
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x70, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v16
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v37
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v40
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x74, v0
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB29_4:
-; SI-NEXT:    ; implicit-def: $vgpr56
+; SI-NEXT:    ; implicit-def: $vgpr54
+; SI-NEXT:    ; implicit-def: $vgpr62
+; SI-NEXT:    ; implicit-def: $vgpr53
+; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr52
 ; SI-NEXT:    ; implicit-def: $vgpr60
-; SI-NEXT:    ; implicit-def: $vgpr46
+; SI-NEXT:    ; implicit-def: $vgpr51
 ; SI-NEXT:    ; implicit-def: $vgpr59
-; SI-NEXT:    ; implicit-def: $vgpr44
+; SI-NEXT:    ; implicit-def: $vgpr50
 ; SI-NEXT:    ; implicit-def: $vgpr58
-; SI-NEXT:    ; implicit-def: $vgpr41
+; SI-NEXT:    ; implicit-def: $vgpr49
 ; SI-NEXT:    ; implicit-def: $vgpr57
-; SI-NEXT:    ; implicit-def: $vgpr55
+; SI-NEXT:    ; implicit-def: $vgpr48
+; SI-NEXT:    ; implicit-def: $vgpr56
 ; SI-NEXT:    ; implicit-def: $vgpr47
-; SI-NEXT:    ; implicit-def: $vgpr53
+; SI-NEXT:    ; implicit-def: $vgpr46
 ; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr51
+; SI-NEXT:    ; implicit-def: $vgpr44
 ; SI-NEXT:    ; implicit-def: $vgpr43
-; SI-NEXT:    ; implicit-def: $vgpr48
 ; SI-NEXT:    ; implicit-def: $vgpr42
-; SI-NEXT:    ; implicit-def: $vgpr38
+; SI-NEXT:    ; implicit-def: $vgpr41
 ; SI-NEXT:    ; implicit-def: $vgpr40
+; SI-NEXT:    ; implicit-def: $vgpr38
+; SI-NEXT:    ; implicit-def: $vgpr37
 ; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr54
 ; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr52
 ; SI-NEXT:    ; implicit-def: $vgpr34
-; SI-NEXT:    ; implicit-def: $vgpr50
+; SI-NEXT:    ; implicit-def: $vgpr33
+; SI-NEXT:    ; implicit-def: $vgpr32
 ; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr49
-; SI-NEXT:    ; implicit-def: $vgpr26
-; SI-NEXT:    ; implicit-def: $vgpr39
-; SI-NEXT:    ; implicit-def: $vgpr22
-; SI-NEXT:    ; implicit-def: $vgpr37
 ; SI-NEXT:    s_branch .LBB29_2
 ;
 ; VI-LABEL: bitcast_v30f32_to_v60i16_scalar:
@@ -27039,361 +27079,396 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3
 ; SI-LABEL: bitcast_v15i64_to_v60i16_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_writelane_b32 v18, s30, 0
+; SI-NEXT:    v_writelane_b32 v18, s31, 1
+; SI-NEXT:    v_writelane_b32 v18, s34, 2
+; SI-NEXT:    v_writelane_b32 v18, s35, 3
+; SI-NEXT:    v_writelane_b32 v18, s36, 4
+; SI-NEXT:    v_writelane_b32 v18, s37, 5
+; SI-NEXT:    v_writelane_b32 v18, s38, 6
+; SI-NEXT:    v_writelane_b32 v18, s39, 7
+; SI-NEXT:    v_writelane_b32 v18, s48, 8
+; SI-NEXT:    v_writelane_b32 v18, s49, 9
+; SI-NEXT:    v_writelane_b32 v18, s50, 10
+; SI-NEXT:    v_writelane_b32 v18, s51, 11
+; SI-NEXT:    v_writelane_b32 v18, s52, 12
+; SI-NEXT:    v_writelane_b32 v18, s53, 13
+; SI-NEXT:    v_writelane_b32 v18, s54, 14
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v17
-; SI-NEXT:    v_readfirstlane_b32 s45, v1
-; SI-NEXT:    v_readfirstlane_b32 s44, v2
-; SI-NEXT:    v_readfirstlane_b32 s43, v3
-; SI-NEXT:    v_readfirstlane_b32 s42, v4
-; SI-NEXT:    v_readfirstlane_b32 s41, v5
-; SI-NEXT:    v_readfirstlane_b32 s40, v6
-; SI-NEXT:    v_readfirstlane_b32 s15, v7
-; SI-NEXT:    v_readfirstlane_b32 s14, v8
-; SI-NEXT:    v_readfirstlane_b32 s13, v9
-; SI-NEXT:    v_readfirstlane_b32 s12, v10
-; SI-NEXT:    v_readfirstlane_b32 s11, v11
-; SI-NEXT:    v_readfirstlane_b32 s10, v12
-; SI-NEXT:    v_readfirstlane_b32 s9, v13
-; SI-NEXT:    v_readfirstlane_b32 s8, v14
-; SI-NEXT:    v_readfirstlane_b32 s7, v15
-; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    v_readfirstlane_b32 s6, v16
+; SI-NEXT:    v_writelane_b32 v18, s55, 15
+; SI-NEXT:    v_readfirstlane_b32 s42, v1
+; SI-NEXT:    v_readfirstlane_b32 s43, v2
+; SI-NEXT:    v_readfirstlane_b32 s40, v3
+; SI-NEXT:    v_readfirstlane_b32 s41, v4
+; SI-NEXT:    v_readfirstlane_b32 s14, v5
+; SI-NEXT:    v_readfirstlane_b32 s15, v6
+; SI-NEXT:    v_readfirstlane_b32 s12, v7
+; SI-NEXT:    v_readfirstlane_b32 s13, v8
+; SI-NEXT:    v_readfirstlane_b32 s10, v9
+; SI-NEXT:    v_readfirstlane_b32 s11, v10
+; SI-NEXT:    v_readfirstlane_b32 s8, v11
+; SI-NEXT:    v_readfirstlane_b32 s9, v12
+; SI-NEXT:    v_readfirstlane_b32 s6, v13
+; SI-NEXT:    v_readfirstlane_b32 s7, v14
+; SI-NEXT:    v_readfirstlane_b32 s4, v15
+; SI-NEXT:    s_and_b64 s[44:45], vcc, exec
+; SI-NEXT:    v_readfirstlane_b32 s5, v16
+; SI-NEXT:    v_writelane_b32 v18, s64, 16
 ; SI-NEXT:    s_cbranch_scc0 .LBB41_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s9
-; SI-NEXT:    v_mov_b32_e32 v3, s11
-; SI-NEXT:    v_mov_b32_e32 v4, s13
-; SI-NEXT:    v_mov_b32_e32 v5, s15
-; SI-NEXT:    v_mov_b32_e32 v6, s41
-; SI-NEXT:    v_mov_b32_e32 v7, s43
-; SI-NEXT:    v_mov_b32_e32 v8, s45
-; SI-NEXT:    v_mov_b32_e32 v9, s28
-; SI-NEXT:    v_mov_b32_e32 v10, s26
-; SI-NEXT:    v_mov_b32_e32 v11, s24
-; SI-NEXT:    v_mov_b32_e32 v12, s22
-; SI-NEXT:    v_mov_b32_e32 v13, s20
-; SI-NEXT:    v_mov_b32_e32 v14, s18
-; SI-NEXT:    v_mov_b32_e32 v15, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v2, s8, v2, 16
-; SI-NEXT:    v_alignbit_b32 v3, s10, v3, 16
-; SI-NEXT:    v_alignbit_b32 v4, s12, v4, 16
-; SI-NEXT:    v_alignbit_b32 v5, s14, v5, 16
-; SI-NEXT:    v_alignbit_b32 v6, s40, v6, 16
-; SI-NEXT:    v_alignbit_b32 v7, s42, v7, 16
-; SI-NEXT:    v_alignbit_b32 v8, s44, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, s29, v9, 16
-; SI-NEXT:    v_alignbit_b32 v10, s27, v10, 16
-; SI-NEXT:    v_alignbit_b32 v11, s25, v11, 16
-; SI-NEXT:    v_alignbit_b32 v12, s23, v12, 16
-; SI-NEXT:    v_alignbit_b32 v13, s21, v13, 16
-; SI-NEXT:    v_alignbit_b32 v14, s19, v14, 16
-; SI-NEXT:    v_alignbit_b32 v15, s17, v15, 16
-; SI-NEXT:    s_lshr_b32 s46, s6, 16
-; SI-NEXT:    s_lshr_b32 s47, s8, 16
-; SI-NEXT:    s_lshr_b32 s56, s10, 16
-; SI-NEXT:    s_lshr_b32 s57, s12, 16
-; SI-NEXT:    s_lshr_b32 s58, s14, 16
-; SI-NEXT:    s_lshr_b32 s59, s40, 16
-; SI-NEXT:    s_lshr_b32 s60, s42, 16
-; SI-NEXT:    s_lshr_b32 s61, s44, 16
-; SI-NEXT:    s_lshr_b32 s62, s29, 16
-; SI-NEXT:    s_lshr_b32 s63, s27, 16
-; SI-NEXT:    s_lshr_b32 s72, s25, 16
-; SI-NEXT:    s_lshr_b32 s73, s23, 16
-; SI-NEXT:    s_lshr_b32 s74, s21, 16
-; SI-NEXT:    s_lshr_b32 s75, s19, 16
-; SI-NEXT:    s_lshr_b32 s76, s17, 16
+; SI-NEXT:    s_lshr_b32 s34, s5, 16
+; SI-NEXT:    s_lshr_b32 s35, s7, 16
+; SI-NEXT:    s_lshr_b32 s36, s9, 16
+; SI-NEXT:    s_lshr_b32 s37, s11, 16
+; SI-NEXT:    s_lshr_b32 s38, s13, 16
+; SI-NEXT:    s_lshr_b32 s39, s15, 16
+; SI-NEXT:    s_lshr_b32 s48, s41, 16
+; SI-NEXT:    s_lshr_b32 s49, s43, 16
+; SI-NEXT:    s_lshr_b32 s50, s29, 16
+; SI-NEXT:    s_lshr_b32 s51, s27, 16
+; SI-NEXT:    s_lshr_b32 s52, s25, 16
+; SI-NEXT:    s_lshr_b32 s53, s23, 16
+; SI-NEXT:    s_lshr_b32 s54, s21, 16
+; SI-NEXT:    s_lshr_b32 s55, s19, 16
+; SI-NEXT:    s_lshr_b32 s64, s17, 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[58:59], s[10:11], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[12:13], 16
+; SI-NEXT:    s_lshr_b64 s[62:63], s[14:15], 16
+; SI-NEXT:    s_lshr_b64 s[72:73], s[40:41], 16
+; SI-NEXT:    s_lshr_b64 s[74:75], s[42:43], 16
+; SI-NEXT:    s_lshr_b64 s[76:77], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[78:79], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[88:89], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[90:91], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[92:93], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[94:95], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[30:31], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB41_3
 ; SI-NEXT:  .LBB41_2: ; %cmp.true
-; SI-NEXT:    s_add_u32 s16, s16, 3
-; SI-NEXT:    s_addc_u32 s17, s17, 0
-; SI-NEXT:    s_add_u32 s18, s18, 3
-; SI-NEXT:    s_addc_u32 s19, s19, 0
-; SI-NEXT:    s_add_u32 s20, s20, 3
-; SI-NEXT:    s_addc_u32 s21, s21, 0
-; SI-NEXT:    s_add_u32 s22, s22, 3
-; SI-NEXT:    s_addc_u32 s23, s23, 0
-; SI-NEXT:    s_add_u32 s24, s24, 3
-; SI-NEXT:    s_addc_u32 s25, s25, 0
-; SI-NEXT:    s_add_u32 s26, s26, 3
-; SI-NEXT:    s_addc_u32 s27, s27, 0
+; SI-NEXT:    s_add_u32 s4, s4, 3
+; SI-NEXT:    s_addc_u32 s5, s5, 0
+; SI-NEXT:    s_add_u32 s6, s6, 3
+; SI-NEXT:    s_addc_u32 s7, s7, 0
+; SI-NEXT:    s_add_u32 s8, s8, 3
+; SI-NEXT:    s_addc_u32 s9, s9, 0
+; SI-NEXT:    s_add_u32 s10, s10, 3
+; SI-NEXT:    s_addc_u32 s11, s11, 0
+; SI-NEXT:    s_add_u32 s12, s12, 3
+; SI-NEXT:    s_addc_u32 s13, s13, 0
+; SI-NEXT:    s_add_u32 s14, s14, 3
+; SI-NEXT:    s_addc_u32 s15, s15, 0
+; SI-NEXT:    s_add_u32 s40, s40, 3
+; SI-NEXT:    s_addc_u32 s41, s41, 0
+; SI-NEXT:    s_add_u32 s42, s42, 3
+; SI-NEXT:    s_addc_u32 s43, s43, 0
 ; SI-NEXT:    s_add_u32 s28, s28, 3
 ; SI-NEXT:    s_addc_u32 s29, s29, 0
-; SI-NEXT:    s_add_u32 s45, s45, 3
-; SI-NEXT:    s_addc_u32 s44, s44, 0
-; SI-NEXT:    s_add_u32 s43, s43, 3
-; SI-NEXT:    s_addc_u32 s42, s42, 0
-; SI-NEXT:    s_add_u32 s41, s41, 3
-; SI-NEXT:    s_addc_u32 s40, s40, 0
-; SI-NEXT:    s_add_u32 s15, s15, 3
-; SI-NEXT:    s_addc_u32 s14, s14, 0
-; SI-NEXT:    s_add_u32 s13, s13, 3
-; SI-NEXT:    s_addc_u32 s12, s12, 0
-; SI-NEXT:    s_add_u32 s11, s11, 3
-; SI-NEXT:    s_addc_u32 s10, s10, 0
-; SI-NEXT:    s_add_u32 s9, s9, 3
-; SI-NEXT:    s_addc_u32 s8, s8, 0
-; SI-NEXT:    s_add_u32 s7, s7, 3
-; SI-NEXT:    s_addc_u32 s6, s6, 0
-; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_mov_b32_e32 v2, s9
-; SI-NEXT:    v_mov_b32_e32 v3, s11
-; SI-NEXT:    v_mov_b32_e32 v4, s13
-; SI-NEXT:    v_mov_b32_e32 v5, s15
-; SI-NEXT:    v_mov_b32_e32 v6, s41
-; SI-NEXT:    v_mov_b32_e32 v7, s43
-; SI-NEXT:    v_mov_b32_e32 v8, s45
-; SI-NEXT:    v_mov_b32_e32 v9, s28
-; SI-NEXT:    v_mov_b32_e32 v10, s26
-; SI-NEXT:    v_mov_b32_e32 v11, s24
-; SI-NEXT:    v_mov_b32_e32 v12, s22
-; SI-NEXT:    v_mov_b32_e32 v13, s20
-; SI-NEXT:    v_mov_b32_e32 v14, s18
-; SI-NEXT:    v_mov_b32_e32 v15, s16
-; SI-NEXT:    v_alignbit_b32 v1, s6, v1, 16
-; SI-NEXT:    v_alignbit_b32 v2, s8, v2, 16
-; SI-NEXT:    v_alignbit_b32 v3, s10, v3, 16
-; SI-NEXT:    v_alignbit_b32 v4, s12, v4, 16
-; SI-NEXT:    v_alignbit_b32 v5, s14, v5, 16
-; SI-NEXT:    v_alignbit_b32 v6, s40, v6, 16
-; SI-NEXT:    v_alignbit_b32 v7, s42, v7, 16
-; SI-NEXT:    v_alignbit_b32 v8, s44, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, s29, v9, 16
-; SI-NEXT:    v_alignbit_b32 v10, s27, v10, 16
-; SI-NEXT:    v_alignbit_b32 v11, s25, v11, 16
-; SI-NEXT:    v_alignbit_b32 v12, s23, v12, 16
-; SI-NEXT:    v_alignbit_b32 v13, s21, v13, 16
-; SI-NEXT:    v_alignbit_b32 v14, s19, v14, 16
-; SI-NEXT:    v_alignbit_b32 v15, s17, v15, 16
-; SI-NEXT:    s_lshr_b32 s46, s6, 16
-; SI-NEXT:    s_lshr_b32 s47, s8, 16
-; SI-NEXT:    s_lshr_b32 s56, s10, 16
-; SI-NEXT:    s_lshr_b32 s57, s12, 16
-; SI-NEXT:    s_lshr_b32 s58, s14, 16
-; SI-NEXT:    s_lshr_b32 s59, s40, 16
-; SI-NEXT:    s_lshr_b32 s60, s42, 16
-; SI-NEXT:    s_lshr_b32 s61, s44, 16
-; SI-NEXT:    s_lshr_b32 s62, s29, 16
-; SI-NEXT:    s_lshr_b32 s63, s27, 16
-; SI-NEXT:    s_lshr_b32 s72, s25, 16
-; SI-NEXT:    s_lshr_b32 s73, s23, 16
-; SI-NEXT:    s_lshr_b32 s74, s21, 16
-; SI-NEXT:    s_lshr_b32 s75, s19, 16
-; SI-NEXT:    s_lshr_b32 s76, s17, 16
+; SI-NEXT:    s_add_u32 s26, s26, 3
+; SI-NEXT:    s_addc_u32 s27, s27, 0
+; SI-NEXT:    s_add_u32 s24, s24, 3
+; SI-NEXT:    s_addc_u32 s25, s25, 0
+; SI-NEXT:    s_add_u32 s22, s22, 3
+; SI-NEXT:    s_addc_u32 s23, s23, 0
+; SI-NEXT:    s_add_u32 s20, s20, 3
+; SI-NEXT:    s_addc_u32 s21, s21, 0
+; SI-NEXT:    s_add_u32 s18, s18, 3
+; SI-NEXT:    s_addc_u32 s19, s19, 0
+; SI-NEXT:    s_add_u32 s16, s16, 3
+; SI-NEXT:    s_addc_u32 s17, s17, 0
+; SI-NEXT:    s_lshr_b32 s34, s5, 16
+; SI-NEXT:    s_lshr_b32 s35, s7, 16
+; SI-NEXT:    s_lshr_b32 s36, s9, 16
+; SI-NEXT:    s_lshr_b32 s37, s11, 16
+; SI-NEXT:    s_lshr_b32 s38, s13, 16
+; SI-NEXT:    s_lshr_b32 s39, s15, 16
+; SI-NEXT:    s_lshr_b32 s48, s41, 16
+; SI-NEXT:    s_lshr_b32 s49, s43, 16
+; SI-NEXT:    s_lshr_b32 s50, s29, 16
+; SI-NEXT:    s_lshr_b32 s51, s27, 16
+; SI-NEXT:    s_lshr_b32 s52, s25, 16
+; SI-NEXT:    s_lshr_b32 s53, s23, 16
+; SI-NEXT:    s_lshr_b32 s54, s21, 16
+; SI-NEXT:    s_lshr_b32 s55, s19, 16
+; SI-NEXT:    s_lshr_b32 s64, s17, 16
+; SI-NEXT:    s_lshr_b64 s[44:45], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[46:47], s[6:7], 16
+; SI-NEXT:    s_lshr_b64 s[56:57], s[8:9], 16
+; SI-NEXT:    s_lshr_b64 s[58:59], s[10:11], 16
+; SI-NEXT:    s_lshr_b64 s[60:61], s[12:13], 16
+; SI-NEXT:    s_lshr_b64 s[62:63], s[14:15], 16
+; SI-NEXT:    s_lshr_b64 s[72:73], s[40:41], 16
+; SI-NEXT:    s_lshr_b64 s[74:75], s[42:43], 16
+; SI-NEXT:    s_lshr_b64 s[76:77], s[28:29], 16
+; SI-NEXT:    s_lshr_b64 s[78:79], s[26:27], 16
+; SI-NEXT:    s_lshr_b64 s[88:89], s[24:25], 16
+; SI-NEXT:    s_lshr_b64 s[90:91], s[22:23], 16
+; SI-NEXT:    s_lshr_b64 s[92:93], s[20:21], 16
+; SI-NEXT:    s_lshr_b64 s[94:95], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[30:31], s[16:17], 16
 ; SI-NEXT:  .LBB41_3: ; %end
-; SI-NEXT:    s_and_b32 s4, s16, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; SI-NEXT:    v_or_b32_e32 v15, s4, v15
-; SI-NEXT:    s_and_b32 s4, s17, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s76, 16
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    v_mov_b32_e32 v16, s4
-; SI-NEXT:    s_and_b32 s4, s18, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v15, vcc, 4, v0
-; SI-NEXT:    v_or_b32_e32 v14, s4, v14
-; SI-NEXT:    s_and_b32 s4, s19, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s75, 16
-; SI-NEXT:    buffer_store_dword v16, v15, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v15, vcc, 8, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v14, v15, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v15, s4
-; SI-NEXT:    s_and_b32 s4, s20, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
+; SI-NEXT:    s_lshl_b32 s45, s30, 16
+; SI-NEXT:    s_and_b32 s16, s16, 0xffff
+; SI-NEXT:    s_or_b32 s16, s16, s45
+; SI-NEXT:    v_mov_b32_e32 v1, s16
+; SI-NEXT:    s_and_b32 s16, s17, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s64, 16
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_lshl_b32 s16, s94, 16
+; SI-NEXT:    s_and_b32 s17, s18, 0xffff
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    v_mov_b32_e32 v3, s16
+; SI-NEXT:    s_and_b32 s16, s19, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s55, 16
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    v_mov_b32_e32 v4, s16
+; SI-NEXT:    s_lshl_b32 s16, s92, 16
+; SI-NEXT:    s_and_b32 s17, s20, 0xffff
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    v_mov_b32_e32 v5, s16
+; SI-NEXT:    s_and_b32 s16, s21, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s54, 16
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 12, v0
-; SI-NEXT:    v_or_b32_e32 v13, s4, v13
-; SI-NEXT:    s_and_b32 s4, s21, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s74, 16
-; SI-NEXT:    buffer_store_dword v15, v14, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v14, vcc, 16, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v13, v14, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v14, s4
-; SI-NEXT:    s_and_b32 s4, s22, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
+; SI-NEXT:    v_mov_b32_e32 v6, s16
+; SI-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 12, v0
+; SI-NEXT:    s_lshl_b32 s16, s90, 16
+; SI-NEXT:    s_and_b32 s17, s22, 0xffff
+; SI-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 16, v0
+; SI-NEXT:    s_or_b32 s16, s17, s16
+; SI-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 20, v0
+; SI-NEXT:    s_waitcnt expcnt(3)
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s23, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s53, 16
+; SI-NEXT:    buffer_store_dword v6, v1, s[0:3], 0 offen
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 24, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v13, vcc, 20, v0
-; SI-NEXT:    v_or_b32_e32 v12, s4, v12
-; SI-NEXT:    s_and_b32 s4, s23, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s73, 16
-; SI-NEXT:    buffer_store_dword v14, v13, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v13, vcc, 24, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v12, v13, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v13, s4
-; SI-NEXT:    s_and_b32 s4, s24, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s24, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s88, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 28, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v12, vcc, 28, v0
-; SI-NEXT:    v_or_b32_e32 v11, s4, v11
-; SI-NEXT:    s_and_b32 s4, s25, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s72, 16
-; SI-NEXT:    buffer_store_dword v13, v12, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v12, vcc, 32, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v11, v12, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v12, s4
-; SI-NEXT:    s_and_b32 s4, s26, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s25, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s52, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 32, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 36, v0
-; SI-NEXT:    v_or_b32_e32 v10, s4, v10
-; SI-NEXT:    s_and_b32 s4, s27, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s63, 16
-; SI-NEXT:    buffer_store_dword v12, v11, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v11, vcc, 40, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v10, v11, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v11, s4
-; SI-NEXT:    s_and_b32 s4, s28, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s26, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s78, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 36, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 44, v0
-; SI-NEXT:    v_or_b32_e32 v9, s4, v9
-; SI-NEXT:    s_and_b32 s4, s29, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s62, 16
-; SI-NEXT:    buffer_store_dword v11, v10, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v10, vcc, 48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v9, v10, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v10, s4
-; SI-NEXT:    s_and_b32 s4, s45, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s27, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s51, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 40, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 52, v0
-; SI-NEXT:    v_or_b32_e32 v8, s4, v8
-; SI-NEXT:    s_and_b32 s4, s44, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s61, 16
-; SI-NEXT:    buffer_store_dword v10, v9, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v9, vcc, 56, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v8, v9, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v9, s4
-; SI-NEXT:    s_and_b32 s4, s43, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s28, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s76, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 44, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 60, v0
-; SI-NEXT:    v_or_b32_e32 v7, s4, v7
-; SI-NEXT:    s_and_b32 s4, s42, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s60, 16
-; SI-NEXT:    buffer_store_dword v9, v8, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 64, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v7, v8, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v8, s4
-; SI-NEXT:    s_and_b32 s4, s41, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s29, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s50, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 48, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x44, v0
-; SI-NEXT:    v_or_b32_e32 v6, s4, v6
-; SI-NEXT:    s_and_b32 s4, s40, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s59, 16
-; SI-NEXT:    buffer_store_dword v8, v7, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v7, vcc, 0x48, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v7, s4
-; SI-NEXT:    s_and_b32 s4, s15, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s42, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s74, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 52, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x4c, v0
-; SI-NEXT:    v_or_b32_e32 v5, s4, v5
-; SI-NEXT:    s_and_b32 s4, s14, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s58, 16
-; SI-NEXT:    buffer_store_dword v7, v6, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x50, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v6, s4
-; SI-NEXT:    s_and_b32 s4, s13, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s43, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s49, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 56, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x54, v0
-; SI-NEXT:    v_or_b32_e32 v4, s4, v4
-; SI-NEXT:    s_and_b32 s4, s12, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s57, 16
-; SI-NEXT:    buffer_store_dword v6, v5, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v5, vcc, 0x58, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v4, v5, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v5, s4
-; SI-NEXT:    s_and_b32 s4, s11, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s40, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s72, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 60, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x5c, v0
-; SI-NEXT:    v_or_b32_e32 v3, s4, v3
-; SI-NEXT:    s_and_b32 s4, s10, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s56, 16
-; SI-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x60, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v4, s4
-; SI-NEXT:    s_and_b32 s4, s9, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s16, s41, 0xffff
+; SI-NEXT:    s_lshl_b32 s17, s48, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 64, v0
+; SI-NEXT:    s_or_b32 s16, s16, s17
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x64, v0
-; SI-NEXT:    v_or_b32_e32 v2, s4, v2
-; SI-NEXT:    s_and_b32 s4, s8, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s47, 16
-; SI-NEXT:    buffer_store_dword v4, v3, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x68, v0
-; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
-; SI-NEXT:    v_mov_b32_e32 v3, s4
-; SI-NEXT:    s_and_b32 s4, s7, 0xffff
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_mov_b32_e32 v2, s16
+; SI-NEXT:    s_and_b32 s14, s14, 0xffff
+; SI-NEXT:    s_lshl_b32 s16, s62, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x44, v0
+; SI-NEXT:    s_or_b32 s14, s14, s16
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x6c, v0
-; SI-NEXT:    v_or_b32_e32 v1, s4, v1
-; SI-NEXT:    s_and_b32 s4, s6, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s46, 16
-; SI-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen
-; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x70, v0
+; SI-NEXT:    v_mov_b32_e32 v2, s14
+; SI-NEXT:    s_and_b32 s14, s15, 0xffff
+; SI-NEXT:    s_lshl_b32 s15, s39, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x48, v0
+; SI-NEXT:    s_or_b32 s14, s14, s15
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s14
+; SI-NEXT:    s_and_b32 s12, s12, 0xffff
+; SI-NEXT:    s_lshl_b32 s14, s60, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x4c, v0
+; SI-NEXT:    s_or_b32 s12, s12, s14
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s12
+; SI-NEXT:    s_and_b32 s12, s13, 0xffff
+; SI-NEXT:    s_lshl_b32 s13, s38, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x50, v0
+; SI-NEXT:    s_or_b32 s12, s12, s13
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s12
+; SI-NEXT:    s_and_b32 s10, s10, 0xffff
+; SI-NEXT:    s_lshl_b32 s12, s58, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x54, v0
+; SI-NEXT:    s_or_b32 s10, s10, s12
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s10, s11, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s37, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x58, v0
+; SI-NEXT:    s_or_b32 s10, s10, s11
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    s_and_b32 s8, s8, 0xffff
+; SI-NEXT:    s_lshl_b32 s10, s56, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x5c, v0
+; SI-NEXT:    s_or_b32 s8, s8, s10
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s8, s9, 0xffff
+; SI-NEXT:    s_lshl_b32 s9, s36, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x60, v0
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_lshl_b32 s8, s46, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x64, v0
+; SI-NEXT:    s_or_b32 s6, s6, s8
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s6, s7, 0xffff
+; SI-NEXT:    s_lshl_b32 s7, s35, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x68, v0
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    s_and_b32 s4, s4, 0xffff
+; SI-NEXT:    s_lshl_b32 s6, s44, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x6c, v0
+; SI-NEXT:    s_or_b32 s4, s4, s6
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v2, s4
+; SI-NEXT:    s_and_b32 s4, s5, 0xffff
+; SI-NEXT:    s_lshl_b32 s5, s34, 16
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x70, v0
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; SI-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x74, v0
-; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s4
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT:    v_readlane_b32 s64, v18, 16
+; SI-NEXT:    v_readlane_b32 s55, v18, 15
+; SI-NEXT:    v_readlane_b32 s54, v18, 14
+; SI-NEXT:    v_readlane_b32 s53, v18, 13
+; SI-NEXT:    v_readlane_b32 s52, v18, 12
+; SI-NEXT:    v_readlane_b32 s51, v18, 11
+; SI-NEXT:    v_readlane_b32 s50, v18, 10
+; SI-NEXT:    v_readlane_b32 s49, v18, 9
+; SI-NEXT:    v_readlane_b32 s48, v18, 8
+; SI-NEXT:    v_readlane_b32 s39, v18, 7
+; SI-NEXT:    v_readlane_b32 s38, v18, 6
+; SI-NEXT:    v_readlane_b32 s37, v18, 5
+; SI-NEXT:    v_readlane_b32 s36, v18, 4
+; SI-NEXT:    v_readlane_b32 s35, v18, 3
+; SI-NEXT:    v_readlane_b32 s34, v18, 2
+; SI-NEXT:    v_readlane_b32 s31, v18, 1
+; SI-NEXT:    v_readlane_b32 s30, v18, 0
+; SI-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; SI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    s_mov_b64 exec, s[4:5]
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB41_4:
-; SI-NEXT:    ; implicit-def: $vgpr15
+; SI-NEXT:    ; implicit-def: $sgpr30
+; SI-NEXT:    ; implicit-def: $sgpr64
+; SI-NEXT:    ; implicit-def: $sgpr94
+; SI-NEXT:    ; implicit-def: $sgpr55
+; SI-NEXT:    ; implicit-def: $sgpr92
+; SI-NEXT:    ; implicit-def: $sgpr54
+; SI-NEXT:    ; implicit-def: $sgpr90
+; SI-NEXT:    ; implicit-def: $sgpr53
+; SI-NEXT:    ; implicit-def: $sgpr88
+; SI-NEXT:    ; implicit-def: $sgpr52
+; SI-NEXT:    ; implicit-def: $sgpr78
+; SI-NEXT:    ; implicit-def: $sgpr51
 ; SI-NEXT:    ; implicit-def: $sgpr76
-; SI-NEXT:    ; implicit-def: $vgpr14
-; SI-NEXT:    ; implicit-def: $sgpr75
-; SI-NEXT:    ; implicit-def: $vgpr13
+; SI-NEXT:    ; implicit-def: $sgpr50
 ; SI-NEXT:    ; implicit-def: $sgpr74
-; SI-NEXT:    ; implicit-def: $vgpr12
-; SI-NEXT:    ; implicit-def: $sgpr73
-; SI-NEXT:    ; implicit-def: $vgpr11
+; SI-NEXT:    ; implicit-def: $sgpr49
 ; SI-NEXT:    ; implicit-def: $sgpr72
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $sgpr63
-; SI-NEXT:    ; implicit-def: $vgpr9
+; SI-NEXT:    ; implicit-def: $sgpr48
 ; SI-NEXT:    ; implicit-def: $sgpr62
-; SI-NEXT:    ; implicit-def: $vgpr8
-; SI-NEXT:    ; implicit-def: $sgpr61
-; SI-NEXT:    ; implicit-def: $vgpr7
+; SI-NEXT:    ; implicit-def: $sgpr39
 ; SI-NEXT:    ; implicit-def: $sgpr60
-; SI-NEXT:    ; implicit-def: $vgpr6
-; SI-NEXT:    ; implicit-def: $sgpr59
-; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $sgpr38
 ; SI-NEXT:    ; implicit-def: $sgpr58
-; SI-NEXT:    ; implicit-def: $vgpr4
-; SI-NEXT:    ; implicit-def: $sgpr57
-; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $sgpr37
 ; SI-NEXT:    ; implicit-def: $sgpr56
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $sgpr47
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $sgpr36
 ; SI-NEXT:    ; implicit-def: $sgpr46
+; SI-NEXT:    ; implicit-def: $sgpr35
+; SI-NEXT:    ; implicit-def: $sgpr34
+; SI-NEXT:    ; implicit-def: $sgpr44
 ; SI-NEXT:    s_branch .LBB41_2
 ;
 ; VI-LABEL: bitcast_v15i64_to_v60i16_scalar:
@@ -37262,203 +37337,207 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    v_mov_b32_e32 v17, s28
 ; SI-NEXT:    v_mov_b32_e32 v18, s29
-; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT:    s_cbranch_scc0 .LBB49_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_alignbit_b32 v31, v16, v15, 16
-; SI-NEXT:    v_alignbit_b32 v32, v14, v13, 16
-; SI-NEXT:    v_alignbit_b32 v33, v12, v11, 16
-; SI-NEXT:    v_alignbit_b32 v34, v10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v35, v8, v7, 16
-; SI-NEXT:    v_alignbit_b32 v36, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v37, v4, v3, 16
-; SI-NEXT:    v_alignbit_b32 v39, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v50, v18, v17, 16
-; SI-NEXT:    v_alignbit_b32 v52, v20, v19, 16
-; SI-NEXT:    v_alignbit_b32 v55, v22, v21, 16
-; SI-NEXT:    v_alignbit_b32 v41, v24, v23, 16
-; SI-NEXT:    v_alignbit_b32 v43, v26, v25, 16
+; SI-NEXT:    v_lshr_b64 v[31:32], v[15:16], 16
+; SI-NEXT:    v_lshr_b64 v[32:33], v[13:14], 16
+; SI-NEXT:    v_lshr_b64 v[33:34], v[11:12], 16
+; SI-NEXT:    v_lshr_b64 v[34:35], v[9:10], 16
+; SI-NEXT:    v_lshr_b64 v[35:36], v[7:8], 16
+; SI-NEXT:    v_lshr_b64 v[36:37], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[51:52], v[1:2], 16
+; SI-NEXT:    v_lshr_b64 v[37:38], v[3:4], 16
+; SI-NEXT:    v_lshr_b64 v[52:53], v[23:24], 16
+; SI-NEXT:    v_lshr_b64 v[38:39], v[17:18], 16
+; SI-NEXT:    v_lshr_b64 v[48:49], v[19:20], 16
+; SI-NEXT:    v_lshr_b64 v[53:54], v[25:26], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v2
 ; SI-NEXT:    s_waitcnt expcnt(6)
-; SI-NEXT:    v_alignbit_b32 v46, v30, v29, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v18
+; SI-NEXT:    s_waitcnt expcnt(5)
+; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v20
 ; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    v_alignbit_b32 v56, v28, v27, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v20
-; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v22
 ; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v24
+; SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v24
 ; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v26
+; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v26
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v30
+; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v30
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v28
+; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v28
+; SI-NEXT:    v_lshr_b64 v[49:50], v[21:22], 16
+; SI-NEXT:    v_lshr_b64 v[54:55], v[29:30], 16
+; SI-NEXT:    v_lshr_b64 v[39:40], v[27:28], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB49_3
 ; SI-NEXT:  .LBB49_2: ; %cmp.true
-; SI-NEXT:    v_add_f64 v[27:28], v[27:28], 1.0
-; SI-NEXT:    v_add_f64 v[29:30], v[29:30], 1.0
-; SI-NEXT:    v_add_f64 v[25:26], v[25:26], 1.0
-; SI-NEXT:    v_add_f64 v[23:24], v[23:24], 1.0
-; SI-NEXT:    v_add_f64 v[21:22], v[21:22], 1.0
-; SI-NEXT:    v_add_f64 v[19:20], v[19:20], 1.0
-; SI-NEXT:    v_add_f64 v[17:18], v[17:18], 1.0
-; SI-NEXT:    v_add_f64 v[1:2], v[1:2], 1.0
-; SI-NEXT:    v_add_f64 v[3:4], v[3:4], 1.0
-; SI-NEXT:    v_add_f64 v[5:6], v[5:6], 1.0
-; SI-NEXT:    v_add_f64 v[7:8], v[7:8], 1.0
-; SI-NEXT:    v_add_f64 v[9:10], v[9:10], 1.0
-; SI-NEXT:    v_add_f64 v[11:12], v[11:12], 1.0
 ; SI-NEXT:    v_add_f64 v[15:16], v[15:16], 1.0
 ; SI-NEXT:    v_add_f64 v[13:14], v[13:14], 1.0
-; SI-NEXT:    v_alignbit_b32 v31, v16, v15, 16
-; SI-NEXT:    v_alignbit_b32 v32, v14, v13, 16
-; SI-NEXT:    v_alignbit_b32 v33, v12, v11, 16
-; SI-NEXT:    v_alignbit_b32 v34, v10, v9, 16
-; SI-NEXT:    v_alignbit_b32 v35, v8, v7, 16
-; SI-NEXT:    v_alignbit_b32 v36, v6, v5, 16
-; SI-NEXT:    v_alignbit_b32 v37, v4, v3, 16
-; SI-NEXT:    v_alignbit_b32 v39, v2, v1, 16
-; SI-NEXT:    v_alignbit_b32 v50, v18, v17, 16
-; SI-NEXT:    v_alignbit_b32 v52, v20, v19, 16
-; SI-NEXT:    v_alignbit_b32 v55, v22, v21, 16
-; SI-NEXT:    v_alignbit_b32 v41, v24, v23, 16
-; SI-NEXT:    v_alignbit_b32 v43, v26, v25, 16
+; SI-NEXT:    v_add_f64 v[11:12], v[11:12], 1.0
+; SI-NEXT:    v_lshr_b64 v[31:32], v[15:16], 16
+; SI-NEXT:    v_add_f64 v[9:10], v[9:10], 1.0
+; SI-NEXT:    v_lshr_b64 v[32:33], v[13:14], 16
+; SI-NEXT:    v_add_f64 v[7:8], v[7:8], 1.0
+; SI-NEXT:    v_lshr_b64 v[33:34], v[11:12], 16
+; SI-NEXT:    v_add_f64 v[5:6], v[5:6], 1.0
+; SI-NEXT:    v_add_f64 v[1:2], v[1:2], 1.0
+; SI-NEXT:    v_lshr_b64 v[34:35], v[9:10], 16
+; SI-NEXT:    v_add_f64 v[3:4], v[3:4], 1.0
+; SI-NEXT:    v_add_f64 v[23:24], v[23:24], 1.0
+; SI-NEXT:    v_lshr_b64 v[35:36], v[7:8], 16
+; SI-NEXT:    v_add_f64 v[17:18], v[17:18], 1.0
+; SI-NEXT:    v_add_f64 v[19:20], v[19:20], 1.0
+; SI-NEXT:    v_add_f64 v[25:26], v[25:26], 1.0
+; SI-NEXT:    v_lshr_b64 v[36:37], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[51:52], v[1:2], 16
+; SI-NEXT:    v_add_f64 v[21:22], v[21:22], 1.0
+; SI-NEXT:    v_add_f64 v[29:30], v[29:30], 1.0
+; SI-NEXT:    v_add_f64 v[27:28], v[27:28], 1.0
+; SI-NEXT:    v_lshr_b64 v[37:38], v[3:4], 16
+; SI-NEXT:    v_lshr_b64 v[52:53], v[23:24], 16
+; SI-NEXT:    v_lshr_b64 v[38:39], v[17:18], 16
+; SI-NEXT:    v_lshr_b64 v[48:49], v[19:20], 16
+; SI-NEXT:    v_lshr_b64 v[53:54], v[25:26], 16
+; SI-NEXT:    v_lshr_b64 v[49:50], v[21:22], 16
+; SI-NEXT:    v_lshr_b64 v[54:55], v[29:30], 16
+; SI-NEXT:    v_lshr_b64 v[39:40], v[27:28], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v41, 16, v16
+; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v14
+; SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v12
+; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v46, 16, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v2
 ; SI-NEXT:    s_waitcnt expcnt(6)
-; SI-NEXT:    v_alignbit_b32 v46, v30, v29, 16
+; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v18
+; SI-NEXT:    s_waitcnt expcnt(5)
+; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v20
 ; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    v_alignbit_b32 v56, v28, v27, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v38, 16, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v49, 16, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v10
-; SI-NEXT:    v_lshrrev_b32_e32 v53, 16, v8
-; SI-NEXT:    v_lshrrev_b32_e32 v54, 16, v6
-; SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v42, 16, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v18
-; SI-NEXT:    v_lshrrev_b32_e32 v45, 16, v20
-; SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v22
+; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v22
 ; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_lshrrev_b32_e32 v57, 16, v24
+; SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v24
 ; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    v_lshrrev_b32_e32 v58, 16, v26
+; SI-NEXT:    v_lshrrev_b32_e32 v61, 16, v26
 ; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v30
+; SI-NEXT:    v_lshrrev_b32_e32 v62, 16, v30
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v28
+; SI-NEXT:    v_lshrrev_b32_e32 v63, 16, v28
 ; SI-NEXT:  .LBB49_3: ; %end
-; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    v_lshlrev_b32_e32 v56, 16, v56
+; SI-NEXT:    v_lshlrev_b32_e32 v39, 16, v39
 ; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v27
-; SI-NEXT:    v_or_b32_e32 v27, v27, v56
+; SI-NEXT:    v_or_b32_e32 v27, v27, v39
 ; SI-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v28
-; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v60
+; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v63
 ; SI-NEXT:    v_or_b32_e32 v27, v27, v28
 ; SI-NEXT:    v_add_i32_e32 v28, vcc, 4, v0
 ; SI-NEXT:    buffer_store_dword v27, v28, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v29
-; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v46
-; SI-NEXT:    v_or_b32_e32 v27, v27, v28
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v54
+; SI-NEXT:    v_and_b32_e32 v28, 0xffff, v29
+; SI-NEXT:    v_or_b32_e32 v27, v28, v27
 ; SI-NEXT:    v_add_i32_e32 v28, vcc, 8, v0
 ; SI-NEXT:    buffer_store_dword v27, v28, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v27, 0xffff, v30
-; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v59
+; SI-NEXT:    v_lshlrev_b32_e32 v28, 16, v62
 ; SI-NEXT:    v_or_b32_e32 v27, v27, v28
 ; SI-NEXT:    v_add_i32_e32 v28, vcc, 12, v0
 ; SI-NEXT:    buffer_store_dword v27, v28, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v25, 0xffff, v25
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v43
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v53
+; SI-NEXT:    v_and_b32_e32 v25, 0xffff, v25
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v27
 ; SI-NEXT:    v_add_i32_e32 v27, vcc, 16, v0
 ; SI-NEXT:    buffer_store_dword v25, v27, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v25, 0xffff, v26
-; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v58
+; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v61
 ; SI-NEXT:    v_or_b32_e32 v25, v25, v26
 ; SI-NEXT:    v_add_i32_e32 v26, vcc, 20, v0
 ; SI-NEXT:    buffer_store_dword v25, v26, s[0:3], 0 offen
-; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v23
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v41
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v52
+; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v23
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v25
 ; SI-NEXT:    v_add_i32_e32 v25, vcc, 24, v0
 ; SI-NEXT:    buffer_store_dword v23, v25, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v23, 0xffff, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v57
+; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v60
 ; SI-NEXT:    v_or_b32_e32 v23, v23, v24
 ; SI-NEXT:    v_add_i32_e32 v24, vcc, 28, v0
 ; SI-NEXT:    buffer_store_dword v23, v24, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v21
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v55
+; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v49
 ; SI-NEXT:    v_or_b32_e32 v21, v21, v23
 ; SI-NEXT:    v_add_i32_e32 v23, vcc, 32, v0
 ; SI-NEXT:    buffer_store_dword v21, v23, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v21, 0xffff, v22
-; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v47
+; SI-NEXT:    v_lshlrev_b32_e32 v22, 16, v59
 ; SI-NEXT:    v_or_b32_e32 v21, v21, v22
 ; SI-NEXT:    v_add_i32_e32 v22, vcc, 36, v0
 ; SI-NEXT:    buffer_store_dword v21, v22, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v19
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v52
+; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v48
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v21
 ; SI-NEXT:    v_add_i32_e32 v21, vcc, 40, v0
 ; SI-NEXT:    buffer_store_dword v19, v21, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v19, 0xffff, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v45
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v58
 ; SI-NEXT:    v_or_b32_e32 v19, v19, v20
 ; SI-NEXT:    v_add_i32_e32 v20, vcc, 44, v0
 ; SI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v17
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v50
+; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v38
 ; SI-NEXT:    v_or_b32_e32 v17, v17, v19
 ; SI-NEXT:    v_add_i32_e32 v19, vcc, 48, v0
 ; SI-NEXT:    buffer_store_dword v17, v19, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v17, 0xffff, v18
-; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v44
+; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v57
 ; SI-NEXT:    v_or_b32_e32 v17, v17, v18
 ; SI-NEXT:    v_add_i32_e32 v18, vcc, 52, v0
 ; SI-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v39
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v51
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v17
 ; SI-NEXT:    v_add_i32_e32 v17, vcc, 56, v0
 ; SI-NEXT:    buffer_store_dword v1, v17, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v42
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v56
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 60, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -37470,7 +37549,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v4
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v40
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v47
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x44, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -37482,7 +37561,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v54
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v46
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x4c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -37494,7 +37573,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v53
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v45
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x54, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -37506,7 +37585,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v10
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v51
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v44
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x5c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -37518,7 +37597,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v49
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v43
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x64, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -37530,7 +37609,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v14
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v48
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v42
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x6c, v0
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
@@ -37542,56 +37621,59 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a,
 ; SI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v16
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v38
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v41
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x74, v0
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB49_4:
-; SI-NEXT:    ; implicit-def: $vgpr56
+; SI-NEXT:    ; implicit-def: $vgpr39
+; SI-NEXT:    ; implicit-def: $vgpr63
+; SI-NEXT:    ; implicit-def: $vgpr54
+; SI-NEXT:    ; implicit-def: $vgpr62
+; SI-NEXT:    ; implicit-def: $vgpr53
+; SI-NEXT:    ; implicit-def: $vgpr61
+; SI-NEXT:    ; implicit-def: $vgpr52
 ; SI-NEXT:    ; implicit-def: $vgpr60
-; SI-NEXT:    ; implicit-def: $vgpr46
+; SI-NEXT:    ; implicit-def: $vgpr49
 ; SI-NEXT:    ; implicit-def: $vgpr59
-; SI-NEXT:    ; implicit-def: $vgpr43
+; SI-NEXT:    ; implicit-def: $vgpr48
 ; SI-NEXT:    ; implicit-def: $vgpr58
-; SI-NEXT:    ; implicit-def: $vgpr41
+; SI-NEXT:    ; implicit-def: $vgpr38
 ; SI-NEXT:    ; implicit-def: $vgpr57
-; SI-NEXT:    ; implicit-def: $vgpr55
+; SI-NEXT:    ; implicit-def: $vgpr56
 ; SI-NEXT:    ; implicit-def: $vgpr47
-; SI-NEXT:    ; implicit-def: $vgpr52
+; SI-NEXT:    ; implicit-def: $vgpr46
 ; SI-NEXT:    ; implicit-def: $vgpr45
-; SI-NEXT:    ; implicit-def: $vgpr50
 ; SI-NEXT:    ; implicit-def: $vgpr44
-; SI-NEXT:    ; implicit-def: $vgpr39
+; SI-NEXT:    ; implicit-def: $vgpr43
 ; SI-NEXT:    ; implicit-def: $vgpr42
+; SI-NEXT:    ; implicit-def: $vgpr41
+; SI-NEXT:    ; implicit-def: $vgpr51
 ; SI-NEXT:    ; implicit-def: $vgpr37
-; SI-NEXT:    ; implicit-def: $vgpr40
 ; SI-NEXT:    ; implicit-def: $vgpr36
-; SI-NEXT:    ; implicit-def: $vgpr54
 ; SI-NEXT:    ; implicit-def: $vgpr35
-; SI-NEXT:    ; implicit-def: $vgpr53
 ; SI-NEXT:    ; implicit-def: $vgpr34
-; SI-NEXT:    ; implicit-def: $vgpr51
 ; SI-NEXT:    ; implicit-def: $vgpr33
-; SI-NEXT:    ; implicit-def: $vgpr49
 ; SI-NEXT:    ; implicit-def: $vgpr32
-; SI-NEXT:    ; implicit-def: $vgpr48
 ; SI-NEXT:    ; implicit-def: $vgpr31
-; SI-NEXT:    ; implicit-def: $vgpr38
 ; SI-NEXT:    s_branch .LBB49_2
 ;
 ; VI-LABEL: bitcast_v15f64_to_v60i16_scalar:
@@ -51097,566 +51179,651 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
 ; SI-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
 ; SI-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:64
-; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32
-; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:4
-; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:8
-; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:12
-; SI-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:16
-; SI-NEXT:    s_waitcnt expcnt(6)
-; SI-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:20
-; SI-NEXT:    s_waitcnt expcnt(5)
-; SI-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:24
-; SI-NEXT:    s_waitcnt expcnt(4)
-; SI-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:28
+; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:64
+; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32
+; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:16
+; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:28
+; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:32
+; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:44
+; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:48
+; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:60
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:4
+; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:8
+; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:20
+; SI-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:24
+; SI-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:36
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:40
+; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:52
+; SI-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:56
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:32
-; SI-NEXT:    s_waitcnt expcnt(2)
-; SI-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:36
-; SI-NEXT:    s_waitcnt expcnt(1)
-; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:40
+; SI-NEXT:    v_cvt_f16_f32_e32 v60, v30
+; SI-NEXT:    v_cvt_f16_f32_e32 v30, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v24
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v37, v18
+; SI-NEXT:    v_cvt_f16_f32_e32 v54, v19
+; SI-NEXT:    v_cvt_f16_f32_e32 v63, v22
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v6
+; SI-NEXT:    v_cvt_f16_f32_e32 v36, v23
+; SI-NEXT:    v_cvt_f16_f32_e32 v61, v26
+; SI-NEXT:    v_cvt_f16_f32_e32 v27, v27
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:44
-; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:48
-; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:52
-; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:56
-; SI-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:60
-; SI-NEXT:    v_cvt_f16_f32_e32 v51, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v52, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v56, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v48, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v49, v6
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT:    v_cvt_f16_f32_e32 v42, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v7
+; SI-NEXT:    v_cvt_f16_f32_e32 v62, s16
+; SI-NEXT:    v_cvt_f16_f32_e32 v23, s19
+; SI-NEXT:    v_cvt_f16_f32_e32 v19, s23
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v8
-; SI-NEXT:    v_cvt_f16_f32_e32 v37, v9
-; SI-NEXT:    v_cvt_f16_f32_e32 v38, v10
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v11
-; SI-NEXT:    v_cvt_f16_f32_e32 v41, v12
-; SI-NEXT:    v_cvt_f16_f32_e32 v34, v13
-; SI-NEXT:    v_cvt_f16_f32_e32 v35, v14
-; SI-NEXT:    v_cvt_f16_f32_e32 v8, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v40, v16
-; SI-NEXT:    v_cvt_f16_f32_e32 v31, v17
-; SI-NEXT:    v_cvt_f16_f32_e32 v32, v18
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, v19
-; SI-NEXT:    v_cvt_f16_f32_e32 v55, v20
-; SI-NEXT:    v_cvt_f16_f32_e32 v20, v21
-; SI-NEXT:    v_cvt_f16_f32_e32 v21, v22
-; SI-NEXT:    v_cvt_f16_f32_e32 v54, v24
-; SI-NEXT:    v_cvt_f16_f32_e32 v16, v25
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, v26
-; SI-NEXT:    v_cvt_f16_f32_e32 v22, v27
-; SI-NEXT:    v_cvt_f16_f32_e32 v25, v28
-; SI-NEXT:    v_cvt_f16_f32_e32 v12, v29
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, v30
-; SI-NEXT:    v_cvt_f16_f32_e32 v29, s22
-; SI-NEXT:    v_cvt_f16_f32_e32 v30, s23
-; SI-NEXT:    v_cvt_f16_f32_e32 v26, s26
-; SI-NEXT:    v_cvt_f16_f32_e32 v27, s27
-; SI-NEXT:    v_cvt_f16_f32_e32 v28, s28
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v10
+; SI-NEXT:    v_cvt_f16_f32_e32 v42, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v26, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v40, v8
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v11
+; SI-NEXT:    v_cvt_f16_f32_e32 v22, v9
+; SI-NEXT:    v_cvt_f16_f32_e32 v47, v12
+; SI-NEXT:    v_cvt_f16_f32_e32 v18, v13
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v14
+; SI-NEXT:    v_cvt_f16_f32_e32 v52, v16
+; SI-NEXT:    v_cvt_f16_f32_e32 v16, v17
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, v21
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v15
+; SI-NEXT:    v_cvt_f16_f32_e32 v12, v25
+; SI-NEXT:    v_cvt_f16_f32_e32 v8, v28
+; SI-NEXT:    v_cvt_f16_f32_e32 v10, v29
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f16_f32_e32 v58, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v28, s26
 ; SI-NEXT:    s_waitcnt vmcnt(14)
-; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v33
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, v23
-; SI-NEXT:    v_cvt_f16_f32_e32 v18, v43
-; SI-NEXT:    v_cvt_f16_f32_e32 v24, v44
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v45
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v32
+; SI-NEXT:    v_cvt_f16_f32_e32 v15, v33
+; SI-NEXT:    v_cvt_f16_f32_e32 v7, v34
+; SI-NEXT:    v_cvt_f16_f32_e32 v11, v35
+; SI-NEXT:    v_cvt_f16_f32_e32 v59, v38
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v39
+; SI-NEXT:    v_cvt_f16_f32_e32 v31, v48
+; SI-NEXT:    v_cvt_f16_f32_e32 v48, s20
+; SI-NEXT:    v_cvt_f16_f32_e32 v39, v50
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v49
+; SI-NEXT:    v_cvt_f16_f32_e32 v34, s24
+; SI-NEXT:    v_cvt_f16_f32_e32 v49, s27
+; SI-NEXT:    v_cvt_f16_f32_e32 v32, s28
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(1)
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v51
+; SI-NEXT:    v_cvt_f16_f32_e32 v50, v20
+; SI-NEXT:    v_cvt_f16_f32_e32 v57, v53
+; SI-NEXT:    s_waitcnt vmcnt(14)
+; SI-NEXT:    v_cvt_f16_f32_e32 v55, v55
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v41
+; SI-NEXT:    v_cvt_f16_f32_e32 v24, v43
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v44
 ; SI-NEXT:    s_waitcnt vmcnt(13)
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, v46
-; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v47
-; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_cvt_f16_f32_e32 v23, v57
-; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v58
-; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_cvt_f16_f32_e32 v7, v59
-; SI-NEXT:    s_waitcnt vmcnt(8)
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v60
-; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_cvt_f16_f32_e32 v19, v61
-; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v62
-; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v36
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v63
-; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_cvt_f16_f32_e32 v36, v39
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_cvt_f16_f32_e32 v62, v50
-; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT:    s_waitcnt vmcnt(2) expcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v53
-; SI-NEXT:    v_cvt_f16_f32_e32 v47, s16
-; SI-NEXT:    v_cvt_f16_f32_e32 v60, s17
-; SI-NEXT:    v_cvt_f16_f32_e32 v44, s18
-; SI-NEXT:    v_cvt_f16_f32_e32 v45, s19
-; SI-NEXT:    v_cvt_f16_f32_e32 v46, s20
-; SI-NEXT:    v_cvt_f16_f32_e32 v59, s21
-; SI-NEXT:    v_cvt_f16_f32_e32 v43, s24
-; SI-NEXT:    v_cvt_f16_f32_e32 v58, s25
-; SI-NEXT:    v_cvt_f16_f32_e32 v57, s29
+; SI-NEXT:    v_cvt_f16_f32_e32 v20, v45
+; SI-NEXT:    v_cvt_f16_f32_e32 v35, s18
+; SI-NEXT:    v_cvt_f16_f32_e32 v38, s21
+; SI-NEXT:    v_cvt_f16_f32_e32 v33, s22
+; SI-NEXT:    v_cvt_f16_f32_e32 v44, s29
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
-; SI-NEXT:    s_cbranch_scc0 .LBB59_2
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt vmcnt(14) expcnt(0)
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v46
+; SI-NEXT:    v_cvt_f16_f32_e32 v46, s25
+; SI-NEXT:    s_cbranch_scc0 .LBB59_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    s_mov_b64 s[4:5], 0
-; SI-NEXT:    s_branch .LBB59_3
-; SI-NEXT:  .LBB59_2:
-; SI-NEXT:    s_mov_b64 s[4:5], -1
-; SI-NEXT:  .LBB59_3: ; %Flow
-; SI-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
-; SI-NEXT:    v_mov_b32_e32 v61, v14
-; SI-NEXT:    v_mov_b32_e32 v63, v15
-; SI-NEXT:    v_mov_b32_e32 v15, v18
-; SI-NEXT:    v_mov_b32_e32 v18, v22
-; SI-NEXT:    v_mov_b32_e32 v22, v33
-; SI-NEXT:    v_mov_b32_e32 v33, v11
-; SI-NEXT:    v_mov_b32_e32 v11, v8
-; SI-NEXT:    v_mov_b32_e32 v8, v5
-; SI-NEXT:    v_mov_b32_e32 v5, v42
-; SI-NEXT:    v_mov_b32_e32 v42, v1
-; SI-NEXT:    s_cbranch_vccnz .LBB59_5
-; SI-NEXT:  ; %bb.4: ; %cmp.true
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT:    s_cbranch_execnz .LBB59_3
+; SI-NEXT:  .LBB59_2: ; %cmp.true
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v23
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v35
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v58
+; SI-NEXT:    v_mov_b32_e32 v43, v34
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v23, v5
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v23
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_or_b32_e32 v35, v3, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v19
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v33
+; SI-NEXT:    v_lshlrev_b32_e32 v34, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v38
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v19, v5
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v19
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_or_b32_e32 v33, v3, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v49
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v28
+; SI-NEXT:    v_mov_b32_e32 v41, v32
+; SI-NEXT:    v_lshlrev_b32_e32 v32, 16, v1
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v49, v5
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v46
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v49
+; SI-NEXT:    v_mov_b32_e32 v53, v27
+; SI-NEXT:    v_or_b32_e32 v28, v3, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v30
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_mov_b32_e32 v38, v55
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v44
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v62
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; SI-NEXT:    v_lshlrev_b32_e32 v29, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v42
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
-; SI-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; SI-NEXT:    v_cvt_f32_f16_e32 v17, v17
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v40
+; SI-NEXT:    v_mov_b32_e32 v40, v54
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v21, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v47
+; SI-NEXT:    v_lshr_b64 v[46:47], v[27:28], 16
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v52
+; SI-NEXT:    v_mov_b32_e32 v52, v15
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v15, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v50
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v13, 16, v1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_or_b32_e32 v30, v3, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v26
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
-; SI-NEXT:    v_add_f32_e32 v7, 0x38000000, v7
-; SI-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; SI-NEXT:    v_cvt_f32_f16_e32 v21, v21
-; SI-NEXT:    v_lshlrev_b32_e32 v39, 16, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; SI-NEXT:    v_add_f32_e32 v6, 0x38000000, v6
-; SI-NEXT:    v_add_f32_e32 v10, 0x38000000, v10
-; SI-NEXT:    v_cvt_f32_f16_e32 v20, v20
-; SI-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; SI-NEXT:    v_or_b32_e32 v62, v1, v39
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; SI-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; SI-NEXT:    v_add_f32_e32 v9, 0x38000000, v9
-; SI-NEXT:    v_add_f32_e32 v13, 0x38000000, v13
-; SI-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; SI-NEXT:    v_cvt_f32_f16_e32 v35, v35
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; SI-NEXT:    v_add_f32_e32 v12, 0x38000000, v12
-; SI-NEXT:    v_add_f32_e32 v17, 0x38000000, v17
-; SI-NEXT:    v_cvt_f32_f16_e32 v34, v34
-; SI-NEXT:    v_cvt_f32_f16_e32 v38, v38
-; SI-NEXT:    v_lshlrev_b32_e32 v39, 16, v4
-; SI-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; SI-NEXT:    v_cvt_f16_f32_e32 v17, v17
-; SI-NEXT:    v_add_f32_e32 v16, 0x38000000, v16
-; SI-NEXT:    v_add_f32_e32 v21, 0x38000000, v21
-; SI-NEXT:    v_cvt_f32_f16_e32 v37, v37
-; SI-NEXT:    v_or_b32_e32 v3, v3, v39
-; SI-NEXT:    v_lshlrev_b32_e32 v39, 16, v7
-; SI-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; SI-NEXT:    v_cvt_f16_f32_e32 v21, v21
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_or_b32_e32 v26, v3, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v22
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_or_b32_e32 v22, v3, v5
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v18
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_or_b32_e32 v18, v3, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v37
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v16
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_mov_b32_e32 v51, v11
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v55, v5
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v55
+; SI-NEXT:    v_or_b32_e32 v16, v3, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v63
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v14
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v37, v5
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v37
+; SI-NEXT:    v_or_b32_e32 v14, v3, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v61
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v12
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v63, v5
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v63
+; SI-NEXT:    v_or_b32_e32 v12, v3, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v60
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v10
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v61, v5
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v61
+; SI-NEXT:    v_or_b32_e32 v10, v3, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v7
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v57
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v7, v5
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
+; SI-NEXT:    v_or_b32_e32 v57, v3, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v59
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v6
+; SI-NEXT:    v_lshr_b64 v[58:59], v[34:35], 16
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v50
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v11, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v8
+; SI-NEXT:    v_mov_b32_e32 v8, v48
+; SI-NEXT:    v_cvt_f16_f32_e32 v48, v5
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v48
+; SI-NEXT:    v_or_b32_e32 v6, v3, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v44
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v31
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v60, v4
+; SI-NEXT:    v_mov_b32_e32 v59, v48
+; SI-NEXT:    v_lshlrev_b32_e32 v56, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v38
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v60
+; SI-NEXT:    v_or_b32_e32 v4, v3, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v38, v43
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshr_b64 v[47:48], v[17:18], 16
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshr_b64 v[44:45], v[29:30], 16
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v24
+; SI-NEXT:    v_cvt_f32_f16_e32 v24, v8
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_add_f32_e32 v24, 0x38000000, v24
+; SI-NEXT:    v_cvt_f16_f32_e32 v24, v24
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v20
+; SI-NEXT:    v_cvt_f32_f16_e32 v20, v39
+; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
+; SI-NEXT:    v_add_f32_e32 v20, 0x38000000, v20
+; SI-NEXT:    v_cvt_f16_f32_e32 v31, v20
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v20, 16, v31
+; SI-NEXT:    v_or_b32_e32 v2, v2, v20
+; SI-NEXT:    v_cvt_f32_f16_e32 v20, v62
+; SI-NEXT:    v_or_b32_e32 v62, v24, v32
+; SI-NEXT:    v_add_f32_e32 v24, 0x38000000, v38
+; SI-NEXT:    v_cvt_f16_f32_e32 v24, v24
 ; SI-NEXT:    v_add_f32_e32 v20, 0x38000000, v20
-; SI-NEXT:    v_add_f32_e32 v32, 0x38000000, v32
-; SI-NEXT:    v_or_b32_e32 v6, v6, v39
-; SI-NEXT:    v_lshlrev_b32_e32 v39, 16, v10
 ; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
-; SI-NEXT:    v_cvt_f16_f32_e32 v32, v32
-; SI-NEXT:    v_add_f32_e32 v31, 0x38000000, v31
-; SI-NEXT:    v_add_f32_e32 v35, 0x38000000, v35
-; SI-NEXT:    v_or_b32_e32 v9, v9, v39
-; SI-NEXT:    v_lshlrev_b32_e32 v39, 16, v13
-; SI-NEXT:    v_cvt_f16_f32_e32 v31, v31
-; SI-NEXT:    v_cvt_f16_f32_e32 v35, v35
-; SI-NEXT:    v_add_f32_e32 v34, 0x38000000, v34
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_or_b32_e32 v42, v24, v27
+; SI-NEXT:    v_mov_b32_e32 v48, v62
+; SI-NEXT:    v_or_b32_e32 v50, v20, v34
+; SI-NEXT:    v_cvt_f32_f16_e32 v20, v41
+; SI-NEXT:    v_mov_b32_e32 v34, v42
+; SI-NEXT:    v_lshr_b64 v[42:43], v[25:26], 16
+; SI-NEXT:    v_mov_b32_e32 v62, v50
+; SI-NEXT:    v_add_f32_e32 v20, 0x38000000, v20
+; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
+; SI-NEXT:    v_or_b32_e32 v41, v20, v29
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v38, v8
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
 ; SI-NEXT:    v_add_f32_e32 v38, 0x38000000, v38
-; SI-NEXT:    v_or_b32_e32 v12, v12, v39
-; SI-NEXT:    v_lshlrev_b32_e32 v39, 16, v17
-; SI-NEXT:    v_cvt_f16_f32_e32 v34, v34
 ; SI-NEXT:    v_cvt_f16_f32_e32 v38, v38
-; SI-NEXT:    v_add_f32_e32 v37, 0x38000000, v37
-; SI-NEXT:    v_or_b32_e32 v16, v16, v39
-; SI-NEXT:    v_lshlrev_b32_e32 v39, 16, v21
-; SI-NEXT:    v_cvt_f16_f32_e32 v37, v37
-; SI-NEXT:    v_or_b32_e32 v20, v20, v39
-; SI-NEXT:    v_lshlrev_b32_e32 v39, 16, v32
-; SI-NEXT:    v_or_b32_e32 v31, v31, v39
-; SI-NEXT:    v_lshlrev_b32_e32 v39, 16, v35
-; SI-NEXT:    v_or_b32_e32 v34, v34, v39
-; SI-NEXT:    v_lshlrev_b32_e32 v39, 16, v38
-; SI-NEXT:    v_or_b32_e32 v37, v37, v39
-; SI-NEXT:    v_cvt_f32_f16_e32 v39, v49
-; SI-NEXT:    v_cvt_f32_f16_e32 v48, v48
-; SI-NEXT:    v_cvt_f32_f16_e32 v50, v51
-; SI-NEXT:    v_cvt_f32_f16_e32 v27, v27
-; SI-NEXT:    v_add_f32_e32 v39, 0x38000000, v39
-; SI-NEXT:    v_cvt_f16_f32_e32 v49, v39
-; SI-NEXT:    v_add_f32_e32 v39, 0x38000000, v48
-; SI-NEXT:    v_cvt_f16_f32_e32 v39, v39
-; SI-NEXT:    v_cvt_f32_f16_e32 v26, v26
-; SI-NEXT:    v_lshlrev_b32_e32 v48, 16, v49
-; SI-NEXT:    v_cvt_f32_f16_e32 v30, v30
-; SI-NEXT:    v_or_b32_e32 v48, v39, v48
-; SI-NEXT:    v_cvt_f32_f16_e32 v39, v52
-; SI-NEXT:    v_cvt_f32_f16_e32 v29, v29
-; SI-NEXT:    v_add_f32_e32 v27, 0x38000000, v27
-; SI-NEXT:    v_cvt_f16_f32_e32 v27, v27
-; SI-NEXT:    v_add_f32_e32 v39, 0x38000000, v39
-; SI-NEXT:    v_cvt_f16_f32_e32 v52, v39
-; SI-NEXT:    v_add_f32_e32 v39, 0x38000000, v50
-; SI-NEXT:    v_cvt_f16_f32_e32 v39, v39
-; SI-NEXT:    v_add_f32_e32 v26, 0x38000000, v26
-; SI-NEXT:    v_add_f32_e32 v30, 0x38000000, v30
-; SI-NEXT:    v_cvt_f16_f32_e32 v26, v26
-; SI-NEXT:    v_cvt_f16_f32_e32 v30, v30
-; SI-NEXT:    v_add_f32_e32 v29, 0x38000000, v29
-; SI-NEXT:    v_cvt_f16_f32_e32 v29, v29
-; SI-NEXT:    v_lshlrev_b32_e32 v50, 16, v52
-; SI-NEXT:    v_or_b32_e32 v51, v39, v50
-; SI-NEXT:    v_lshlrev_b32_e32 v39, 16, v27
-; SI-NEXT:    v_or_b32_e32 v26, v26, v39
-; SI-NEXT:    v_lshlrev_b32_e32 v39, 16, v30
-; SI-NEXT:    v_or_b32_e32 v29, v29, v39
-; SI-NEXT:    v_cvt_f32_f16_e32 v39, v45
-; SI-NEXT:    v_cvt_f32_f16_e32 v50, v44
-; SI-NEXT:    v_cvt_f32_f16_e32 v53, v59
-; SI-NEXT:    v_cvt_f32_f16_e32 v56, v56
-; SI-NEXT:    v_add_f32_e32 v39, 0x38000000, v39
-; SI-NEXT:    v_cvt_f16_f32_e32 v45, v39
-; SI-NEXT:    v_add_f32_e32 v39, 0x38000000, v50
-; SI-NEXT:    v_cvt_f16_f32_e32 v39, v39
-; SI-NEXT:    v_add_f32_e32 v53, 0x38000000, v53
-; SI-NEXT:    v_lshlrev_b32_e32 v50, 16, v45
-; SI-NEXT:    v_cvt_f16_f32_e32 v53, v53
-; SI-NEXT:    v_or_b32_e32 v44, v39, v50
-; SI-NEXT:    v_cvt_f32_f16_e32 v39, v60
-; SI-NEXT:    v_cvt_f32_f16_e32 v50, v47
-; SI-NEXT:    v_lshlrev_b32_e32 v53, 16, v53
-; SI-NEXT:    v_add_f32_e32 v56, 0x38000000, v56
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v24, v8
+; SI-NEXT:    v_or_b32_e32 v8, v38, v25
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v24, 0x38000000, v24
+; SI-NEXT:    v_cvt_f16_f32_e32 v24, v24
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v20, v8
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v20, 0x38000000, v20
+; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v38, v8
+; SI-NEXT:    v_or_b32_e32 v8, v24, v21
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_or_b32_e32 v8, v20, v17
+; SI-NEXT:    v_add_f32_e32 v38, 0x38000000, v38
+; SI-NEXT:    v_cvt_f16_f32_e32 v38, v38
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; SI-NEXT:    v_cvt_f32_f16_e32 v24, v40
+; SI-NEXT:    v_cvt_f32_f16_e32 v20, v36
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_or_b32_e32 v8, v38, v15
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT:    v_cvt_f32_f16_e32 v38, v53
+; SI-NEXT:    v_add_f32_e32 v24, 0x38000000, v24
+; SI-NEXT:    v_cvt_f16_f32_e32 v24, v24
+; SI-NEXT:    v_add_f32_e32 v20, 0x38000000, v20
+; SI-NEXT:    v_add_f32_e32 v38, 0x38000000, v38
+; SI-NEXT:    v_cvt_f16_f32_e32 v38, v38
+; SI-NEXT:    v_or_b32_e32 v54, v24, v13
+; SI-NEXT:    v_cvt_f32_f16_e32 v24, v52
+; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
+; SI-NEXT:    v_or_b32_e32 v52, v38, v9
+; SI-NEXT:    v_mov_b32_e32 v27, v52
+; SI-NEXT:    v_add_f32_e32 v24, 0x38000000, v24
+; SI-NEXT:    v_or_b32_e32 v36, v20, v11
+; SI-NEXT:    v_cvt_f32_f16_e32 v20, v51
+; SI-NEXT:    v_cvt_f16_f32_e32 v24, v24
+; SI-NEXT:    v_lshr_b64 v[52:53], v[15:16], 16
+; SI-NEXT:    v_add_f32_e32 v20, 0x38000000, v20
+; SI-NEXT:    v_cvt_f16_f32_e32 v20, v20
+; SI-NEXT:    v_or_b32_e32 v51, v24, v56
+; SI-NEXT:    v_mov_b32_e32 v15, v51
+; SI-NEXT:    v_lshr_b64 v[50:51], v[13:14], 16
+; SI-NEXT:    v_or_b32_e32 v24, v20, v5
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v38, v8
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT:    v_add_f32_e32 v38, 0x38000000, v38
+; SI-NEXT:    v_cvt_f16_f32_e32 v38, v38
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v39, v8
+; SI-NEXT:    v_or_b32_e32 v8, v38, v3
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
 ; SI-NEXT:    v_add_f32_e32 v39, 0x38000000, v39
 ; SI-NEXT:    v_cvt_f16_f32_e32 v39, v39
-; SI-NEXT:    v_add_f32_e32 v50, 0x38000000, v50
-; SI-NEXT:    v_cvt_f16_f32_e32 v50, v50
-; SI-NEXT:    v_cvt_f16_f32_e32 v56, v56
-; SI-NEXT:    v_lshlrev_b32_e32 v39, 16, v39
-; SI-NEXT:    v_cvt_f32_f16_e32 v23, v23
-; SI-NEXT:    v_or_b32_e32 v47, v50, v39
-; SI-NEXT:    v_cvt_f32_f16_e32 v50, v46
-; SI-NEXT:    v_cvt_f32_f16_e32 v46, v58
-; SI-NEXT:    v_lshlrev_b32_e32 v56, 16, v56
-; SI-NEXT:    v_add_f32_e32 v23, 0x38000000, v23
-; SI-NEXT:    v_add_f32_e32 v50, 0x38000000, v50
-; SI-NEXT:    v_add_f32_e32 v46, 0x38000000, v46
-; SI-NEXT:    v_cvt_f16_f32_e32 v50, v50
-; SI-NEXT:    v_cvt_f16_f32_e32 v58, v46
-; SI-NEXT:    v_cvt_f16_f32_e32 v23, v23
-; SI-NEXT:    v_cvt_f32_f16_e32 v57, v57
-; SI-NEXT:    v_or_b32_e32 v46, v50, v53
-; SI-NEXT:    v_lshlrev_b32_e32 v50, 16, v58
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_or_b32_e32 v8, v39, v1
+; SI-NEXT:    v_lshr_b64 v[38:39], v[32:33], 16
+; SI-NEXT:    v_mov_b32_e32 v32, v41
+; SI-NEXT:    v_lshr_b64 v[40:41], v[21:22], 16
+; SI-NEXT:    v_lshr_b64 v[20:21], v[11:12], 16
+; SI-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshr_b64 v[20:21], v[56:57], 16
+; SI-NEXT:    buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; SI-NEXT:    buffer_store_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; SI-NEXT:    v_mov_b32_e32 v11, v24
+; SI-NEXT:    v_lshr_b64 v[8:9], v[9:10], 16
+; SI-NEXT:    v_mov_b32_e32 v39, v31
+; SI-NEXT:    v_mov_b32_e32 v31, v60
+; SI-NEXT:    v_mov_b32_e32 v60, v61
+; SI-NEXT:    v_mov_b32_e32 v61, v63
+; SI-NEXT:    v_mov_b32_e32 v63, v37
+; SI-NEXT:    v_mov_b32_e32 v37, v55
+; SI-NEXT:    v_lshr_b64 v[55:56], v[5:6], 16
+; SI-NEXT:    v_lshr_b64 v[24:25], v[3:4], 16
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshr_b64 v[20:21], v[1:2], 16
+; SI-NEXT:  .LBB59_3: ; %end
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v58
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v62
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v35
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v23
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 4, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v38
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v48
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 8, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v33
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v19
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 12, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v46
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v34
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 16, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v28
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v49
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 20, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v44
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v32
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 24, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v30
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 28, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v42
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 32, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v26
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 36, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v40
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; SI-NEXT:    v_or_b32_e32 v1, v3, v1
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 40, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v22
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 44, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v47
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 48, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v18
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 52, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v52
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 56, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v16
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v37
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 60, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v54
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v50
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 64, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v14
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v63
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x44, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v36
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v58, v14
-; SI-NEXT:    v_cvt_f32_f16_e32 v42, v42
-; SI-NEXT:    v_cvt_f32_f16_e32 v41, v41
-; SI-NEXT:    v_cvt_f32_f16_e32 v40, v40
-; SI-NEXT:    v_add_f32_e32 v58, 0x38000000, v58
-; SI-NEXT:    v_cvt_f16_f32_e32 v58, v58
-; SI-NEXT:    v_cvt_f32_f16_e32 v55, v55
-; SI-NEXT:    v_cvt_f32_f16_e32 v54, v54
-; SI-NEXT:    v_cvt_f32_f16_e32 v25, v25
-; SI-NEXT:    v_or_b32_e32 v14, v58, v56
-; SI-NEXT:    v_cvt_f32_f16_e32 v58, v63
-; SI-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; SI-NEXT:    v_cvt_f32_f16_e32 v24, v24
-; SI-NEXT:    v_cvt_f32_f16_e32 v19, v19
-; SI-NEXT:    v_add_f32_e32 v58, 0x38000000, v58
-; SI-NEXT:    v_cvt_f16_f32_e32 v58, v58
-; SI-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v13
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x48, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v14, v36
-; SI-NEXT:    v_cvt_f32_f16_e32 v43, v43
-; SI-NEXT:    v_cvt_f32_f16_e32 v28, v28
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; SI-NEXT:    v_cvt_f32_f16_e32 v33, v33
-; SI-NEXT:    v_cvt_f32_f16_e32 v22, v22
-; SI-NEXT:    v_cvt_f32_f16_e32 v18, v18
-; SI-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; SI-NEXT:    v_or_b32_e32 v63, v58, v23
-; SI-NEXT:    v_cvt_f32_f16_e32 v58, v61
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v12
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v61
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x4c, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v27
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x50, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v10
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v60
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x54, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; SI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v15
 ; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_cvt_f32_f16_e32 v36, v1
-; SI-NEXT:    v_add_f32_e32 v57, 0x38000000, v57
-; SI-NEXT:    v_add_f32_e32 v42, 0x38000000, v42
-; SI-NEXT:    v_add_f32_e32 v41, 0x38000000, v41
-; SI-NEXT:    v_add_f32_e32 v40, 0x38000000, v40
-; SI-NEXT:    v_add_f32_e32 v55, 0x38000000, v55
-; SI-NEXT:    v_add_f32_e32 v54, 0x38000000, v54
-; SI-NEXT:    v_add_f32_e32 v25, 0x38000000, v25
-; SI-NEXT:    v_add_f32_e32 v24, 0x38000000, v24
-; SI-NEXT:    v_add_f32_e32 v19, 0x38000000, v19
-; SI-NEXT:    v_add_f32_e32 v14, 0x38000000, v14
-; SI-NEXT:    v_add_f32_e32 v43, 0x38000000, v43
-; SI-NEXT:    v_cvt_f16_f32_e32 v57, v57
-; SI-NEXT:    v_add_f32_e32 v28, 0x38000000, v28
-; SI-NEXT:    v_cvt_f16_f32_e32 v42, v42
-; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v41, v41
-; SI-NEXT:    v_add_f32_e32 v8, 0x38000000, v8
-; SI-NEXT:    v_cvt_f16_f32_e32 v40, v40
-; SI-NEXT:    v_add_f32_e32 v11, 0x38000000, v11
-; SI-NEXT:    v_cvt_f16_f32_e32 v55, v55
-; SI-NEXT:    v_add_f32_e32 v33, 0x38000000, v33
-; SI-NEXT:    v_cvt_f16_f32_e32 v54, v54
-; SI-NEXT:    v_add_f32_e32 v22, 0x38000000, v22
-; SI-NEXT:    v_cvt_f16_f32_e32 v25, v25
-; SI-NEXT:    v_add_f32_e32 v18, 0x38000000, v18
-; SI-NEXT:    v_cvt_f16_f32_e32 v24, v24
-; SI-NEXT:    v_add_f32_e32 v15, 0x38000000, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v19, v19
-; SI-NEXT:    v_add_f32_e32 v58, 0x38000000, v58
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; SI-NEXT:    v_add_f32_e32 v36, 0x38000000, v36
-; SI-NEXT:    v_cvt_f16_f32_e32 v43, v43
-; SI-NEXT:    v_cvt_f16_f32_e32 v28, v28
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; SI-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; SI-NEXT:    v_cvt_f16_f32_e32 v33, v33
-; SI-NEXT:    v_cvt_f16_f32_e32 v22, v22
-; SI-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; SI-NEXT:    v_cvt_f16_f32_e32 v58, v58
-; SI-NEXT:    v_cvt_f16_f32_e32 v36, v36
-; SI-NEXT:    v_lshlrev_b32_e32 v57, 16, v57
-; SI-NEXT:    v_lshlrev_b32_e32 v42, 16, v42
-; SI-NEXT:    v_lshlrev_b32_e32 v41, 16, v41
-; SI-NEXT:    v_lshlrev_b32_e32 v40, 16, v40
-; SI-NEXT:    v_lshlrev_b32_e32 v55, 16, v55
-; SI-NEXT:    v_lshlrev_b32_e32 v54, 16, v54
-; SI-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; SI-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
-; SI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; SI-NEXT:    v_or_b32_e32 v43, v43, v50
-; SI-NEXT:    v_or_b32_e32 v28, v28, v57
-; SI-NEXT:    v_or_b32_e32 v5, v5, v42
-; SI-NEXT:    v_or_b32_e32 v8, v8, v41
-; SI-NEXT:    v_or_b32_e32 v11, v11, v40
-; SI-NEXT:    v_or_b32_e32 v33, v33, v55
-; SI-NEXT:    v_or_b32_e32 v22, v22, v54
-; SI-NEXT:    v_or_b32_e32 v18, v18, v25
-; SI-NEXT:    v_or_b32_e32 v15, v15, v24
-; SI-NEXT:    v_or_b32_e32 v61, v58, v19
-; SI-NEXT:    v_or_b32_e32 v1, v36, v14
-; SI-NEXT:    v_alignbit_b32 v60, v44, v39, 16
-; SI-NEXT:    v_alignbit_b32 v59, v29, v53, 16
-; SI-NEXT:    v_alignbit_b32 v58, v26, v50, 16
-; SI-NEXT:    v_alignbit_b32 v57, v51, v57, 16
-; SI-NEXT:    v_alignbit_b32 v56, v48, v56, 16
-; SI-NEXT:    v_alignbit_b32 v42, v37, v42, 16
-; SI-NEXT:    v_alignbit_b32 v41, v34, v41, 16
-; SI-NEXT:    v_alignbit_b32 v40, v31, v40, 16
-; SI-NEXT:    v_alignbit_b32 v55, v20, v55, 16
-; SI-NEXT:    v_alignbit_b32 v54, v16, v54, 16
-; SI-NEXT:    v_alignbit_b32 v25, v12, v25, 16
-; SI-NEXT:    v_alignbit_b32 v24, v9, v24, 16
-; SI-NEXT:    v_alignbit_b32 v23, v6, v23, 16
-; SI-NEXT:    v_alignbit_b32 v19, v3, v19, 16
-; SI-NEXT:    v_alignbit_b32 v36, v62, v14, 16
-; SI-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; SI-NEXT:  .LBB59_5: ; %end
-; SI-NEXT:    v_and_b32_e32 v39, 0xffff, v47
-; SI-NEXT:    v_lshlrev_b32_e32 v50, 16, v60
-; SI-NEXT:    v_or_b32_e32 v39, v39, v50
-; SI-NEXT:    buffer_store_dword v39, v0, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v39, 0xffff, v44
-; SI-NEXT:    v_lshlrev_b32_e32 v50, 16, v45
-; SI-NEXT:    v_or_b32_e32 v39, v39, v50
-; SI-NEXT:    v_add_i32_e32 v50, vcc, 4, v0
-; SI-NEXT:    buffer_store_dword v39, v50, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v39, 0xffff, v46
-; SI-NEXT:    v_lshlrev_b32_e32 v50, 16, v59
-; SI-NEXT:    v_and_b32_e32 v29, 0xffff, v29
-; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v30
-; SI-NEXT:    v_or_b32_e32 v39, v39, v50
-; SI-NEXT:    v_add_i32_e32 v50, vcc, 8, v0
-; SI-NEXT:    v_or_b32_e32 v29, v29, v30
-; SI-NEXT:    v_add_i32_e32 v30, vcc, 12, v0
-; SI-NEXT:    buffer_store_dword v39, v50, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v29, v30, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v29, 0xffff, v43
-; SI-NEXT:    v_lshlrev_b32_e32 v30, 16, v58
-; SI-NEXT:    v_and_b32_e32 v26, 0xffff, v26
-; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; SI-NEXT:    v_or_b32_e32 v29, v29, v30
-; SI-NEXT:    v_add_i32_e32 v30, vcc, 16, v0
-; SI-NEXT:    v_or_b32_e32 v26, v26, v27
-; SI-NEXT:    v_add_i32_e32 v27, vcc, 20, v0
-; SI-NEXT:    buffer_store_dword v29, v30, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v26, v27, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v26, 0xffff, v28
-; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v57
-; SI-NEXT:    v_or_b32_e32 v26, v26, v27
-; SI-NEXT:    v_add_i32_e32 v27, vcc, 24, v0
-; SI-NEXT:    buffer_store_dword v26, v27, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v26, 0xffff, v51
-; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v52
-; SI-NEXT:    v_or_b32_e32 v26, v26, v27
-; SI-NEXT:    v_add_i32_e32 v27, vcc, 28, v0
-; SI-NEXT:    buffer_store_dword v26, v27, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v56
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; SI-NEXT:    v_or_b32_e32 v3, v3, v4
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x6c, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v26, 0xffff, v14
-; SI-NEXT:    v_or_b32_e32 v26, v26, v27
-; SI-NEXT:    v_add_i32_e32 v27, vcc, 32, v0
-; SI-NEXT:    buffer_store_dword v26, v27, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v26, 0xffff, v48
-; SI-NEXT:    v_lshlrev_b32_e32 v27, 16, v49
-; SI-NEXT:    v_or_b32_e32 v26, v26, v27
-; SI-NEXT:    v_add_i32_e32 v27, vcc, 36, v0
-; SI-NEXT:    buffer_store_dword v26, v27, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v42
-; SI-NEXT:    v_or_b32_e32 v5, v5, v26
-; SI-NEXT:    v_add_i32_e32 v26, vcc, 40, v0
-; SI-NEXT:    buffer_store_dword v5, v26, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v37
-; SI-NEXT:    v_lshlrev_b32_e32 v26, 16, v38
-; SI-NEXT:    v_or_b32_e32 v5, v5, v26
-; SI-NEXT:    v_add_i32_e32 v26, vcc, 44, v0
-; SI-NEXT:    buffer_store_dword v5, v26, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v8
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v41
-; SI-NEXT:    v_or_b32_e32 v5, v5, v8
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 48, v0
-; SI-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v34
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v35
-; SI-NEXT:    v_or_b32_e32 v5, v5, v8
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 52, v0
-; SI-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v11
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v40
-; SI-NEXT:    v_or_b32_e32 v5, v5, v8
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 56, v0
-; SI-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v31
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v32
-; SI-NEXT:    v_or_b32_e32 v5, v5, v8
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 60, v0
-; SI-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v33
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v55
-; SI-NEXT:    v_or_b32_e32 v5, v5, v8
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 64, v0
-; SI-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v20
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v21
-; SI-NEXT:    v_or_b32_e32 v5, v5, v8
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x44, v0
-; SI-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v22
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v54
-; SI-NEXT:    v_or_b32_e32 v5, v5, v8
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x48, v0
-; SI-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v16
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v17
-; SI-NEXT:    v_or_b32_e32 v5, v5, v8
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x4c, v0
-; SI-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v18
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v25
-; SI-NEXT:    v_or_b32_e32 v5, v5, v8
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x50, v0
-; SI-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v12
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v13
-; SI-NEXT:    v_or_b32_e32 v5, v5, v8
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x54, v0
-; SI-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v15
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v24
-; SI-NEXT:    v_or_b32_e32 v5, v5, v8
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x58, v0
-; SI-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v9
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
-; SI-NEXT:    v_or_b32_e32 v5, v5, v8
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x5c, v0
-; SI-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v63
-; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v23
-; SI-NEXT:    v_or_b32_e32 v5, v5, v8
-; SI-NEXT:    v_add_i32_e32 v8, vcc, 0x60, v0
-; SI-NEXT:    buffer_store_dword v5, v8, s[0:3], 0 offen
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
-; SI-NEXT:    v_or_b32_e32 v5, v5, v6
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x64, v0
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v8
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x58, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
 ; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v61
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v19
-; SI-NEXT:    v_or_b32_e32 v5, v5, v6
-; SI-NEXT:    v_add_i32_e32 v6, vcc, 0x68, v0
-; SI-NEXT:    buffer_store_dword v5, v6, s[0:3], 0 offen
-; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
-; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v36
-; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v1
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v62
-; SI-NEXT:    v_or_b32_e32 v3, v3, v4
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x70, v0
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v57
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x5c, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v11
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v55
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x60, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v59
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x64, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v24
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x68, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v31
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x6c, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v20
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 0x70, v0
+; SI-NEXT:    buffer_store_dword v1, v3, s[0:3], 0 offen
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v39
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x74, v0
-; SI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
 ; SI-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
 ; SI-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
@@ -51676,6 +51843,8 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    s_setpc_b64 s[30:31]
+; SI-NEXT:  .LBB59_4:
+; SI-NEXT:    s_branch .LBB59_2
 ;
 ; VI-LABEL: bitcast_v60f16_to_v60i16_scalar:
 ; VI:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
index 4aded5da3668a..685e2fbdecfad 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
@@ -632,51 +632,53 @@ define inreg <12 x i8> @bitcast_v3i32_to_v12i8_scalar(<3 x i32> inreg %a, i32 in
 ; SI-NEXT:    s_cmp_lg_u32 s19, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB5_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v11, s4, v0, 24
-; SI-NEXT:    v_alignbit_b32 v10, s4, v0, 16
-; SI-NEXT:    v_alignbit_b32 v9, s4, v0, 8
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v3, s17, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, s17, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 8
-; SI-NEXT:    s_lshr_b32 s6, s17, 24
-; SI-NEXT:    s_lshr_b32 s7, s17, 16
-; SI-NEXT:    s_lshr_b32 s8, s17, 8
+; SI-NEXT:    s_lshr_b32 s19, s17, 24
+; SI-NEXT:    s_lshr_b32 s22, s17, 16
+; SI-NEXT:    s_lshr_b32 s23, s17, 8
+; SI-NEXT:    s_lshr_b64 s[6:7], s[18:19], 24
+; SI-NEXT:    s_lshr_b64 s[10:11], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[18:19], 8
+; SI-NEXT:    s_lshr_b64 s[4:5], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[8:9], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[16:17], 8
 ; SI-NEXT:    s_cbranch_execnz .LBB5_3
 ; SI-NEXT:  .LBB5_2: ; %cmp.true
-; SI-NEXT:    s_add_i32 s16, s16, 3
-; SI-NEXT:    s_add_i32 s18, s18, 3
 ; SI-NEXT:    s_add_i32 s17, s17, 3
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v3, s17, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, s17, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 8
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v11, s4, v0, 24
-; SI-NEXT:    v_alignbit_b32 v10, s4, v0, 16
-; SI-NEXT:    v_alignbit_b32 v9, s4, v0, 8
-; SI-NEXT:    s_lshr_b32 s6, s17, 24
-; SI-NEXT:    s_lshr_b32 s7, s17, 16
-; SI-NEXT:    s_lshr_b32 s8, s17, 8
+; SI-NEXT:    s_add_i32 s18, s18, 3
+; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    s_lshr_b32 s19, s17, 24
+; SI-NEXT:    s_lshr_b64 s[4:5], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[8:9], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[12:13], s[16:17], 8
+; SI-NEXT:    s_lshr_b32 s22, s17, 16
+; SI-NEXT:    s_lshr_b32 s23, s17, 8
+; SI-NEXT:    s_lshr_b64 s[6:7], s[18:19], 24
+; SI-NEXT:    s_lshr_b64 s[10:11], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[18:19], 8
 ; SI-NEXT:  .LBB5_3: ; %end
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
+; SI-NEXT:    v_mov_b32_e32 v1, s12
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_mov_b32_e32 v3, s4
 ; SI-NEXT:    v_mov_b32_e32 v4, s17
-; SI-NEXT:    v_mov_b32_e32 v5, s8
-; SI-NEXT:    v_mov_b32_e32 v6, s7
-; SI-NEXT:    v_mov_b32_e32 v7, s6
+; SI-NEXT:    v_mov_b32_e32 v5, s23
+; SI-NEXT:    v_mov_b32_e32 v6, s22
+; SI-NEXT:    v_mov_b32_e32 v7, s19
 ; SI-NEXT:    v_mov_b32_e32 v8, s18
+; SI-NEXT:    v_mov_b32_e32 v9, s14
+; SI-NEXT:    v_mov_b32_e32 v10, s10
+; SI-NEXT:    v_mov_b32_e32 v11, s6
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB5_4:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr3
+; SI-NEXT:    ; implicit-def: $sgpr12
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $sgpr7
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr23
+; SI-NEXT:    ; implicit-def: $sgpr22
+; SI-NEXT:    ; implicit-def: $sgpr19
+; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr10
 ; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $vgpr11
 ; SI-NEXT:    s_branch .LBB5_2
 ;
 ; VI-LABEL: bitcast_v3i32_to_v12i8_scalar:
@@ -3133,31 +3135,29 @@ define inreg <6 x i16> @bitcast_v3i32_to_v6i16_scalar(<3 x i32> inreg %a, i32 in
 ; SI-NEXT:    s_cmp_lg_u32 s19, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB17_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s4, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s17, 16
+; SI-NEXT:    s_lshr_b32 s10, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB17_3
 ; SI-NEXT:  .LBB17_2: ; %cmp.true
-; SI-NEXT:    s_add_i32 s16, s16, 3
 ; SI-NEXT:    s_add_i32 s18, s18, 3
 ; SI-NEXT:    s_add_i32 s17, s17, 3
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s4, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s17, 16
+; SI-NEXT:    s_add_i32 s16, s16, 3
+; SI-NEXT:    s_lshr_b64 s[6:7], s[16:17], 16
+; SI-NEXT:    s_lshr_b32 s10, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[18:19], 16
 ; SI-NEXT:  .LBB17_3: ; %end
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
+; SI-NEXT:    v_mov_b32_e32 v1, s6
 ; SI-NEXT:    v_mov_b32_e32 v2, s17
-; SI-NEXT:    v_mov_b32_e32 v3, s6
+; SI-NEXT:    v_mov_b32_e32 v3, s10
 ; SI-NEXT:    v_mov_b32_e32 v4, s18
+; SI-NEXT:    v_mov_b32_e32 v5, s4
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB17_4:
-; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr4
 ; SI-NEXT:    s_branch .LBB17_2
 ;
 ; VI-LABEL: bitcast_v3i32_to_v6i16_scalar:
@@ -3762,50 +3762,59 @@ define inreg <12 x i8> @bitcast_v3f32_to_v12i8_scalar(<3 x float> inreg %a, i32
 ; SI-NEXT:    s_cmp_lg_u32 s19, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB21_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v11, s4, v0, 24
-; SI-NEXT:    v_alignbit_b32 v10, s4, v0, 16
-; SI-NEXT:    v_alignbit_b32 v9, s4, v0, 8
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v3, s17, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, s17, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 8
-; SI-NEXT:    s_lshr_b32 s6, s17, 24
-; SI-NEXT:    s_lshr_b32 s7, s17, 16
-; SI-NEXT:    s_lshr_b32 s8, s17, 8
+; SI-NEXT:    s_lshr_b32 s19, s17, 24
+; SI-NEXT:    s_lshr_b32 s22, s17, 16
+; SI-NEXT:    s_lshr_b32 s23, s17, 8
+; SI-NEXT:    s_lshr_b64 s[10:11], s[18:19], 24
+; SI-NEXT:    s_lshr_b64 s[12:13], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[14:15], s[18:19], 8
+; SI-NEXT:    s_lshr_b64 s[4:5], s[16:17], 24
+; SI-NEXT:    s_lshr_b64 s[6:7], s[16:17], 16
+; SI-NEXT:    s_lshr_b64 s[8:9], s[16:17], 8
 ; SI-NEXT:    s_cbranch_execnz .LBB21_4
 ; SI-NEXT:  .LBB21_2: ; %cmp.true
 ; SI-NEXT:    v_add_f32_e64 v8, s18, 1.0
-; SI-NEXT:    v_add_f32_e64 v4, s17, 1.0
-; SI-NEXT:    v_add_f32_e64 v0, s16, 1.0
-; SI-NEXT:    v_alignbit_b32 v3, v4, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, v4, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, v4, v0, 8
-; SI-NEXT:    v_alignbit_b32 v11, s4, v8, 24
-; SI-NEXT:    v_alignbit_b32 v10, s4, v8, 16
-; SI-NEXT:    v_alignbit_b32 v9, s4, v8, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 24, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
-; SI-NEXT:    s_setpc_b64 s[30:31]
+; SI-NEXT:    v_add_f32_e64 v17, s17, 1.0
+; SI-NEXT:    v_add_f32_e64 v16, s16, 1.0
+; SI-NEXT:    v_lshr_b64 v[11:12], v[8:9], 24
+; SI-NEXT:    v_lshr_b64 v[3:4], v[16:17], 24
+; SI-NEXT:    v_lshr_b64 v[14:15], v[16:17], 16
+; SI-NEXT:    v_lshr_b64 v[1:2], v[16:17], 8
+; SI-NEXT:    v_lshr_b64 v[12:13], v[8:9], 16
+; SI-NEXT:    v_lshr_b64 v[9:10], v[8:9], 8
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 24, v17
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v17
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v17
+; SI-NEXT:    s_branch .LBB21_5
 ; SI-NEXT:  .LBB21_3:
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $sgpr7
 ; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr9
-; SI-NEXT:    ; implicit-def: $vgpr10
-; SI-NEXT:    ; implicit-def: $vgpr11
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr23
+; SI-NEXT:    ; implicit-def: $sgpr22
+; SI-NEXT:    ; implicit-def: $sgpr19
+; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr12
+; SI-NEXT:    ; implicit-def: $sgpr10
 ; SI-NEXT:    s_branch .LBB21_2
 ; SI-NEXT:  .LBB21_4:
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_mov_b32_e32 v4, s17
+; SI-NEXT:    v_mov_b32_e32 v16, s16
+; SI-NEXT:    v_mov_b32_e32 v17, s17
 ; SI-NEXT:    v_mov_b32_e32 v8, s18
-; SI-NEXT:    v_mov_b32_e32 v5, s8
-; SI-NEXT:    v_mov_b32_e32 v6, s7
-; SI-NEXT:    v_mov_b32_e32 v7, s6
+; SI-NEXT:    v_mov_b32_e32 v5, s23
+; SI-NEXT:    v_mov_b32_e32 v6, s22
+; SI-NEXT:    v_mov_b32_e32 v7, s19
+; SI-NEXT:    v_mov_b32_e32 v11, s10
+; SI-NEXT:    v_mov_b32_e32 v12, s12
+; SI-NEXT:    v_mov_b32_e32 v9, s14
+; SI-NEXT:    v_mov_b32_e32 v3, s4
+; SI-NEXT:    v_mov_b32_e32 v14, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s8
+; SI-NEXT:  .LBB21_5: ; %end
+; SI-NEXT:    v_mov_b32_e32 v0, v16
+; SI-NEXT:    v_mov_b32_e32 v2, v14
+; SI-NEXT:    v_mov_b32_e32 v4, v17
+; SI-NEXT:    v_mov_b32_e32 v10, v12
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bitcast_v3f32_to_v12i8_scalar:
@@ -6282,30 +6291,33 @@ define inreg <6 x i16> @bitcast_v3f32_to_v6i16_scalar(<3 x float> inreg %a, i32
 ; SI-NEXT:    s_cmp_lg_u32 s19, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB33_3
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_mov_b32_e32 v0, s18
-; SI-NEXT:    v_alignbit_b32 v5, s4, v0, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_alignbit_b32 v1, s17, v0, 16
-; SI-NEXT:    s_lshr_b32 s6, s17, 16
+; SI-NEXT:    s_lshr_b32 s10, s17, 16
+; SI-NEXT:    s_lshr_b64 s[4:5], s[18:19], 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[16:17], 16
 ; SI-NEXT:    s_cbranch_execnz .LBB33_4
 ; SI-NEXT:  .LBB33_2: ; %cmp.true
 ; SI-NEXT:    v_add_f32_e64 v4, s18, 1.0
-; SI-NEXT:    v_add_f32_e64 v2, s17, 1.0
-; SI-NEXT:    v_add_f32_e64 v0, s16, 1.0
-; SI-NEXT:    v_alignbit_b32 v1, v2, v0, 16
-; SI-NEXT:    v_alignbit_b32 v5, s4, v4, 16
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-NEXT:    s_setpc_b64 s[30:31]
+; SI-NEXT:    v_add_f32_e64 v8, s17, 1.0
+; SI-NEXT:    v_add_f32_e64 v7, s16, 1.0
+; SI-NEXT:    v_lshr_b64 v[1:2], v[7:8], 16
+; SI-NEXT:    v_lshr_b64 v[5:6], v[4:5], 16
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v8
+; SI-NEXT:    s_branch .LBB33_5
 ; SI-NEXT:  .LBB33_3:
-; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr5
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr4
 ; SI-NEXT:    s_branch .LBB33_2
 ; SI-NEXT:  .LBB33_4:
-; SI-NEXT:    v_mov_b32_e32 v0, s16
-; SI-NEXT:    v_mov_b32_e32 v2, s17
+; SI-NEXT:    v_mov_b32_e32 v7, s16
+; SI-NEXT:    v_mov_b32_e32 v8, s17
 ; SI-NEXT:    v_mov_b32_e32 v4, s18
-; SI-NEXT:    v_mov_b32_e32 v3, s6
+; SI-NEXT:    v_mov_b32_e32 v3, s10
+; SI-NEXT:    v_mov_b32_e32 v5, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s6
+; SI-NEXT:  .LBB33_5: ; %end
+; SI-NEXT:    v_mov_b32_e32 v0, v7
+; SI-NEXT:    v_mov_b32_e32 v2, v8
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: bitcast_v3f32_to_v6i16_scalar:
@@ -7981,62 +7993,64 @@ define inreg <12 x i8> @bitcast_v6bf16_to_v12i8_scalar(<6 x bfloat> inreg %a, i3
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    s_cmp_lg_u32 s22, 0
-; SI-NEXT:    v_mul_f32_e64 v16, 1.0, s17
-; SI-NEXT:    v_mul_f32_e64 v17, 1.0, s16
-; SI-NEXT:    v_mul_f32_e64 v14, 1.0, s19
-; SI-NEXT:    v_mul_f32_e64 v15, 1.0, s18
-; SI-NEXT:    v_mul_f32_e64 v12, 1.0, s21
-; SI-NEXT:    v_mul_f32_e64 v13, 1.0, s20
+; SI-NEXT:    v_mul_f32_e64 v17, 1.0, s17
+; SI-NEXT:    v_mul_f32_e64 v18, 1.0, s16
+; SI-NEXT:    v_mul_f32_e64 v15, 1.0, s19
+; SI-NEXT:    v_mul_f32_e64 v16, 1.0, s18
+; SI-NEXT:    v_mul_f32_e64 v0, 1.0, s21
+; SI-NEXT:    v_mul_f32_e64 v14, 1.0, s20
 ; SI-NEXT:    s_cbranch_scc0 .LBB39_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v16
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v12
-; SI-NEXT:    v_alignbit_b32 v0, v0, v17, 16
-; SI-NEXT:    v_alignbit_b32 v4, v6, v15, 16
-; SI-NEXT:    v_alignbit_b32 v8, v10, v13, 16
-; SI-NEXT:    v_alignbit_b32 v3, v4, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, v4, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, v4, v0, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v7, 24, v14
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 24, v12
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v17
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v15
+; SI-NEXT:    v_alignbit_b32 v12, v1, v18, 16
+; SI-NEXT:    v_alignbit_b32 v13, v6, v16, 16
+; SI-NEXT:    v_lshr_b64 v[3:4], v[12:13], 24
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
+; SI-NEXT:    v_lshr_b64 v[4:5], v[12:13], 16
+; SI-NEXT:    v_alignbit_b32 v8, v10, v14, 16
+; SI-NEXT:    v_lshr_b64 v[1:2], v[12:13], 8
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 24, v15
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 24, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v13
 ; SI-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
 ; SI-NEXT:    s_cbranch_execnz .LBB39_3
 ; SI-NEXT:  .LBB39_2: ; %cmp.true
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v16
-; SI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v17
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v17
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v18
+; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v14
-; SI-NEXT:    v_alignbit_b32 v0, v1, v0, 16
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v15
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_alignbit_b32 v12, v2, v1, 16
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v15
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v16
 ; SI-NEXT:    v_add_f32_e32 v7, 0x40c00000, v2
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v7
-; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v12
-; SI-NEXT:    v_alignbit_b32 v4, v6, v1, 16
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v13
-; SI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v2
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; SI-NEXT:    v_alignbit_b32 v13, v6, v1, 16
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v14
+; SI-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
 ; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v11
+; SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
+; SI-NEXT:    v_lshr_b64 v[3:4], v[12:13], 24
 ; SI-NEXT:    v_alignbit_b32 v8, v10, v1, 16
-; SI-NEXT:    v_alignbit_b32 v3, v4, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, v4, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, v4, v0, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
+; SI-NEXT:    v_lshr_b64 v[4:5], v[12:13], 16
+; SI-NEXT:    v_lshr_b64 v[1:2], v[12:13], 8
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v13
 ; SI-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
 ; SI-NEXT:    v_lshrrev_b32_e32 v7, 24, v7
-; SI-NEXT:    v_lshrrev_b32_e32 v11, 24, v11
+; SI-NEXT:    v_lshrrev_b32_e32 v11, 24, v0
 ; SI-NEXT:  .LBB39_3: ; %end
+; SI-NEXT:    v_mov_b32_e32 v0, v12
+; SI-NEXT:    v_mov_b32_e32 v2, v4
+; SI-NEXT:    v_mov_b32_e32 v4, v13
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB39_4:
-; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr12
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr4
+; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr5
 ; SI-NEXT:    ; implicit-def: $vgpr6
 ; SI-NEXT:    ; implicit-def: $vgpr7
@@ -9524,69 +9538,71 @@ define inreg <12 x i8> @bitcast_v6f16_to_v12i8_scalar(<6 x half> inreg %a, i32 i
 ; SI-LABEL: bitcast_v6f16_to_v12i8_scalar:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_cvt_f16_f32_e32 v15, s17
-; SI-NEXT:    v_cvt_f16_f32_e32 v14, s16
+; SI-NEXT:    v_cvt_f16_f32_e32 v16, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v15, s16
 ; SI-NEXT:    v_cvt_f16_f32_e32 v6, s19
-; SI-NEXT:    v_cvt_f16_f32_e32 v13, s18
+; SI-NEXT:    v_cvt_f16_f32_e32 v14, s18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v10, s21
-; SI-NEXT:    v_cvt_f16_f32_e32 v12, s20
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, s20
 ; SI-NEXT:    s_cmp_lg_u32 s22, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB43_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v15
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v16
+; SI-NEXT:    v_or_b32_e32 v12, v15, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v10
-; SI-NEXT:    v_or_b32_e32 v0, v14, v0
-; SI-NEXT:    v_or_b32_e32 v4, v13, v1
-; SI-NEXT:    v_or_b32_e32 v8, v12, v7
-; SI-NEXT:    v_alignbit_b32 v3, v4, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, v4, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, v4, v0, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
+; SI-NEXT:    v_or_b32_e32 v13, v14, v1
+; SI-NEXT:    v_lshr_b64 v[1:2], v[12:13], 8
+; SI-NEXT:    v_lshr_b64 v[3:4], v[12:13], 24
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v10
+; SI-NEXT:    v_lshr_b64 v[4:5], v[12:13], 16
+; SI-NEXT:    v_or_b32_e32 v8, v0, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v13
 ; SI-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
 ; SI-NEXT:    v_bfe_u32 v7, v6, 8, 8
 ; SI-NEXT:    v_bfe_u32 v11, v10, 8, 8
 ; SI-NEXT:    s_cbranch_execnz .LBB43_3
 ; SI-NEXT:  .LBB43_2: ; %cmp.true
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, v15
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v14
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, v6
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v13
-; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v16
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v15
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v6
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v14
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v6, v2
-; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; SI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_or_b32_e32 v12, v2, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v10
-; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, v12
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, v3
+; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v10, v1
-; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; SI-NEXT:    v_or_b32_e32 v4, v2, v3
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v10
-; SI-NEXT:    v_or_b32_e32 v8, v1, v2
-; SI-NEXT:    v_alignbit_b32 v3, v4, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, v4, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, v4, v0, 8
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
+; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v6
+; SI-NEXT:    v_or_b32_e32 v13, v2, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v10
+; SI-NEXT:    v_lshr_b64 v[3:4], v[12:13], 24
+; SI-NEXT:    v_or_b32_e32 v8, v0, v1
+; SI-NEXT:    v_lshr_b64 v[4:5], v[12:13], 16
+; SI-NEXT:    v_lshr_b64 v[1:2], v[12:13], 8
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 8, v13
 ; SI-NEXT:    v_lshrrev_b32_e32 v9, 8, v8
 ; SI-NEXT:    v_bfe_u32 v7, v6, 8, 8
 ; SI-NEXT:    v_bfe_u32 v11, v10, 8, 8
 ; SI-NEXT:  .LBB43_3: ; %end
+; SI-NEXT:    v_mov_b32_e32 v0, v12
+; SI-NEXT:    v_mov_b32_e32 v2, v4
+; SI-NEXT:    v_mov_b32_e32 v4, v13
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB43_4:
-; SI-NEXT:    ; implicit-def: $vgpr0
+; SI-NEXT:    ; implicit-def: $vgpr12
 ; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr4
+; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr5
 ; SI-NEXT:    ; implicit-def: $vgpr7
 ; SI-NEXT:    ; implicit-def: $vgpr8
@@ -10274,37 +10290,37 @@ define inreg <6 x i16> @bitcast_v12i8_to_v6i16_scalar(<12 x i8> inreg %a, i32 in
 ; SI-NEXT:    s_cmp_lg_u32 s28, 0
 ; SI-NEXT:    s_cbranch_scc0 .LBB45_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
-; SI-NEXT:    s_and_b32 s4, s20, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s21, 8
+; SI-NEXT:    s_and_b32 s4, s16, 0xff
+; SI-NEXT:    s_lshl_b32 s5, s17, 8
 ; SI-NEXT:    s_or_b32 s4, s4, s5
-; SI-NEXT:    s_and_b32 s5, s22, 0xff
+; SI-NEXT:    s_and_b32 s5, s18, 0xff
 ; SI-NEXT:    s_lshl_b32 s5, s5, 16
-; SI-NEXT:    s_lshl_b32 s6, s23, 24
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s6, s5
-; SI-NEXT:    s_or_b32 s7, s4, s5
-; SI-NEXT:    s_and_b32 s4, s18, 0xff
-; SI-NEXT:    s_lshl_b32 s4, s4, 16
 ; SI-NEXT:    s_lshl_b32 s6, s19, 24
-; SI-NEXT:    s_or_b32 s4, s6, s4
-; SI-NEXT:    s_and_b32 s6, s16, 0xff
-; SI-NEXT:    s_lshl_b32 s8, s17, 8
-; SI-NEXT:    s_or_b32 s6, s6, s8
-; SI-NEXT:    s_and_b32 s6, s6, 0xffff
-; SI-NEXT:    v_mov_b32_e32 v0, s4
-; SI-NEXT:    s_or_b32 s6, s6, s4
-; SI-NEXT:    s_and_b32 s4, s24, 0xff
-; SI-NEXT:    s_lshl_b32 s8, s25, 8
-; SI-NEXT:    s_or_b32 s4, s4, s8
-; SI-NEXT:    s_and_b32 s8, s26, 0xff
-; SI-NEXT:    s_lshl_b32 s8, s8, 16
-; SI-NEXT:    s_lshl_b32 s9, s27, 24
+; SI-NEXT:    s_or_b32 s12, s6, s5
+; SI-NEXT:    s_and_b32 s5, s24, 0xff
+; SI-NEXT:    s_lshl_b32 s6, s25, 8
+; SI-NEXT:    s_or_b32 s5, s5, s6
+; SI-NEXT:    s_and_b32 s6, s26, 0xff
+; SI-NEXT:    s_lshl_b32 s6, s6, 16
+; SI-NEXT:    s_lshl_b32 s7, s27, 24
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_or_b32 s8, s7, s6
+; SI-NEXT:    s_or_b32 s10, s5, s8
+; SI-NEXT:    s_and_b32 s5, s20, 0xff
+; SI-NEXT:    s_lshl_b32 s6, s21, 8
+; SI-NEXT:    s_or_b32 s5, s5, s6
+; SI-NEXT:    s_and_b32 s6, s22, 0xff
+; SI-NEXT:    s_lshl_b32 s6, s6, 16
+; SI-NEXT:    s_lshl_b32 s7, s23, 24
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_or_b32 s9, s7, s6
+; SI-NEXT:    s_or_b32 s13, s5, s9
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s10, s9, s8
-; SI-NEXT:    v_alignbit_b32 v1, s7, v0, 16
-; SI-NEXT:    s_or_b32 s8, s4, s10
-; SI-NEXT:    s_lshr_b32 s9, s5, 16
-; SI-NEXT:    s_lshr_b32 s10, s10, 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[12:13], 16
+; SI-NEXT:    s_or_b32 s4, s4, s12
+; SI-NEXT:    s_lshr_b32 s7, s9, 16
+; SI-NEXT:    s_lshr_b32 s11, s8, 16
+; SI-NEXT:    s_mov_b32 s5, s13
 ; SI-NEXT:    s_cbranch_execnz .LBB45_3
 ; SI-NEXT:  .LBB45_2: ; %cmp.true
 ; SI-NEXT:    s_add_i32 s16, s16, 3
@@ -10318,52 +10334,51 @@ define inreg <6 x i16> @bitcast_v12i8_to_v6i16_scalar(<12 x i8> inreg %a, i32 in
 ; SI-NEXT:    s_lshl_b32 s6, s6, 16
 ; SI-NEXT:    s_and_b32 s4, s4, 0xffff
 ; SI-NEXT:    s_or_b32 s5, s5, s6
-; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s20, s20, 3
-; SI-NEXT:    s_add_i32 s6, s4, 0x3000000
-; SI-NEXT:    s_and_b32 s4, s20, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s21, 8
-; SI-NEXT:    s_add_i32 s22, s22, 3
 ; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    s_and_b32 s5, s20, 0xff
+; SI-NEXT:    s_lshl_b32 s6, s21, 8
+; SI-NEXT:    s_add_i32 s22, s22, 3
+; SI-NEXT:    s_or_b32 s5, s6, s5
 ; SI-NEXT:    s_and_b32 s7, s22, 0xff
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s23, 24
+; SI-NEXT:    s_addk_i32 s5, 0x300
+; SI-NEXT:    s_lshl_b32 s6, s23, 24
 ; SI-NEXT:    s_lshl_b32 s7, s7, 16
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s5, s7
-; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    s_and_b32 s5, s5, 0xffff
+; SI-NEXT:    s_or_b32 s6, s6, s7
 ; SI-NEXT:    s_add_i32 s24, s24, 3
-; SI-NEXT:    s_add_i32 s7, s4, 0x3000000
-; SI-NEXT:    s_and_b32 s4, s24, 0xff
-; SI-NEXT:    s_lshl_b32 s5, s25, 8
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    s_and_b32 s6, s24, 0xff
+; SI-NEXT:    s_lshl_b32 s7, s25, 8
 ; SI-NEXT:    s_add_i32 s26, s26, 3
-; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    s_or_b32 s6, s7, s6
 ; SI-NEXT:    s_and_b32 s8, s26, 0xff
-; SI-NEXT:    s_addk_i32 s4, 0x300
-; SI-NEXT:    s_lshl_b32 s5, s27, 24
+; SI-NEXT:    s_addk_i32 s6, 0x300
+; SI-NEXT:    s_lshl_b32 s7, s27, 24
 ; SI-NEXT:    s_lshl_b32 s8, s8, 16
-; SI-NEXT:    s_and_b32 s4, s4, 0xffff
-; SI-NEXT:    s_or_b32 s5, s5, s8
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s8, s4, 0x3000000
-; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_alignbit_b32 v1, s7, v0, 16
-; SI-NEXT:    s_lshr_b32 s9, s7, 16
-; SI-NEXT:    s_lshr_b32 s10, s8, 16
+; SI-NEXT:    s_and_b32 s6, s6, 0xffff
+; SI-NEXT:    s_or_b32 s7, s7, s8
+; SI-NEXT:    s_add_i32 s4, s4, 0x3000000
+; SI-NEXT:    s_add_i32 s5, s5, 0x3000000
+; SI-NEXT:    s_or_b32 s6, s7, s6
+; SI-NEXT:    s_add_i32 s10, s6, 0x3000000
+; SI-NEXT:    s_lshr_b64 s[6:7], s[4:5], 16
+; SI-NEXT:    s_lshr_b32 s7, s5, 16
+; SI-NEXT:    s_lshr_b32 s11, s10, 16
 ; SI-NEXT:  .LBB45_3: ; %end
-; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_mov_b32_e32 v2, s7
-; SI-NEXT:    v_mov_b32_e32 v3, s9
-; SI-NEXT:    v_mov_b32_e32 v4, s8
-; SI-NEXT:    v_mov_b32_e32 v5, s10
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s6
+; SI-NEXT:    v_mov_b32_e32 v2, s5
+; SI-NEXT:    v_mov_b32_e32 v3, s7
+; SI-NEXT:    v_mov_b32_e32 v4, s10
+; SI-NEXT:    v_mov_b32_e32 v5, s11
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB45_4:
+; SI-NEXT:    ; implicit-def: $sgpr4
 ; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr1
 ; SI-NEXT:    ; implicit-def: $sgpr7
-; SI-NEXT:    ; implicit-def: $sgpr9
-; SI-NEXT:    ; implicit-def: $sgpr8
 ; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr11
 ; SI-NEXT:    s_branch .LBB45_2
 ;
 ; VI-LABEL: bitcast_v12i8_to_v6i16_scalar:
@@ -10954,74 +10969,74 @@ define inreg <12 x i8> @bitcast_v6i16_to_v12i8_scalar(<6 x i16> inreg %a, i32 in
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_and_b32 s4, s16, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s17, 16
-; SI-NEXT:    s_or_b32 s6, s4, s5
-; SI-NEXT:    s_and_b32 s4, s18, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s19, 16
-; SI-NEXT:    s_or_b32 s7, s4, s5
-; SI-NEXT:    s_and_b32 s4, s20, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s21, 16
-; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    s_or_b32 s8, s4, s5
-; SI-NEXT:    v_alignbit_b32 v3, s7, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, s7, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, s7, v0, 8
-; SI-NEXT:    s_lshr_b32 s9, s7, 8
-; SI-NEXT:    s_lshr_b32 s12, s8, 8
-; SI-NEXT:    s_and_b32 s10, s19, 0xffff
-; SI-NEXT:    s_and_b32 s13, s21, 0xffff
+; SI-NEXT:    s_or_b32 s4, s4, s5
+; SI-NEXT:    s_and_b32 s5, s18, 0xffff
+; SI-NEXT:    s_lshl_b32 s6, s19, 16
+; SI-NEXT:    s_or_b32 s5, s5, s6
+; SI-NEXT:    s_lshr_b64 s[8:9], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[4:5], 8
+; SI-NEXT:    s_and_b32 s9, s20, 0xffff
+; SI-NEXT:    s_lshl_b32 s11, s21, 16
+; SI-NEXT:    s_lshr_b64 s[6:7], s[4:5], 24
+; SI-NEXT:    s_or_b32 s14, s9, s11
+; SI-NEXT:    s_lshr_b32 s7, s5, 8
+; SI-NEXT:    s_lshr_b32 s15, s14, 8
+; SI-NEXT:    s_and_b32 s9, s19, 0xffff
+; SI-NEXT:    s_and_b32 s22, s21, 0xffff
 ; SI-NEXT:    s_bfe_u32 s11, s19, 0x80008
-; SI-NEXT:    s_bfe_u32 s14, s21, 0x80008
+; SI-NEXT:    s_bfe_u32 s23, s21, 0x80008
 ; SI-NEXT:    s_cbranch_execnz .LBB47_3
 ; SI-NEXT:  .LBB47_2: ; %cmp.true
 ; SI-NEXT:    s_add_i32 s16, s16, 3
 ; SI-NEXT:    s_and_b32 s4, s16, 0xffff
 ; SI-NEXT:    s_lshl_b32 s5, s17, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
 ; SI-NEXT:    s_add_i32 s18, s18, 3
-; SI-NEXT:    s_add_i32 s6, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s18, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s19, 16
 ; SI-NEXT:    s_or_b32 s4, s5, s4
+; SI-NEXT:    s_and_b32 s5, s18, 0xffff
+; SI-NEXT:    s_lshl_b32 s6, s19, 16
 ; SI-NEXT:    s_add_i32 s20, s20, 3
-; SI-NEXT:    s_add_i32 s7, s4, 0x30000
-; SI-NEXT:    s_and_b32 s4, s20, 0xffff
-; SI-NEXT:    s_lshl_b32 s5, s21, 16
-; SI-NEXT:    s_or_b32 s4, s5, s4
-; SI-NEXT:    s_add_i32 s8, s4, 0x30000
-; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_alignbit_b32 v3, s7, v0, 24
-; SI-NEXT:    v_alignbit_b32 v2, s7, v0, 16
-; SI-NEXT:    v_alignbit_b32 v1, s7, v0, 8
-; SI-NEXT:    s_lshr_b32 s11, s7, 24
-; SI-NEXT:    s_lshr_b32 s10, s7, 16
-; SI-NEXT:    s_lshr_b32 s9, s7, 8
-; SI-NEXT:    s_lshr_b32 s14, s8, 24
-; SI-NEXT:    s_lshr_b32 s13, s8, 16
-; SI-NEXT:    s_lshr_b32 s12, s8, 8
+; SI-NEXT:    s_or_b32 s5, s6, s5
+; SI-NEXT:    s_and_b32 s6, s20, 0xffff
+; SI-NEXT:    s_lshl_b32 s7, s21, 16
+; SI-NEXT:    s_add_i32 s4, s4, 0x30000
+; SI-NEXT:    s_add_i32 s5, s5, 0x30000
+; SI-NEXT:    s_or_b32 s6, s7, s6
+; SI-NEXT:    s_add_i32 s14, s6, 0x30000
+; SI-NEXT:    s_lshr_b64 s[6:7], s[4:5], 24
+; SI-NEXT:    s_lshr_b64 s[8:9], s[4:5], 16
+; SI-NEXT:    s_lshr_b64 s[10:11], s[4:5], 8
+; SI-NEXT:    s_lshr_b32 s11, s5, 24
+; SI-NEXT:    s_lshr_b32 s9, s5, 16
+; SI-NEXT:    s_lshr_b32 s7, s5, 8
+; SI-NEXT:    s_lshr_b32 s23, s14, 24
+; SI-NEXT:    s_lshr_b32 s22, s14, 16
+; SI-NEXT:    s_lshr_b32 s15, s14, 8
 ; SI-NEXT:  .LBB47_3: ; %end
-; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_mov_b32_e32 v4, s7
-; SI-NEXT:    v_mov_b32_e32 v5, s9
-; SI-NEXT:    v_mov_b32_e32 v6, s10
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s10
+; SI-NEXT:    v_mov_b32_e32 v2, s8
+; SI-NEXT:    v_mov_b32_e32 v3, s6
+; SI-NEXT:    v_mov_b32_e32 v4, s5
+; SI-NEXT:    v_mov_b32_e32 v5, s7
+; SI-NEXT:    v_mov_b32_e32 v6, s9
 ; SI-NEXT:    v_mov_b32_e32 v7, s11
-; SI-NEXT:    v_mov_b32_e32 v8, s8
-; SI-NEXT:    v_mov_b32_e32 v9, s12
-; SI-NEXT:    v_mov_b32_e32 v10, s13
-; SI-NEXT:    v_mov_b32_e32 v11, s14
+; SI-NEXT:    v_mov_b32_e32 v8, s14
+; SI-NEXT:    v_mov_b32_e32 v9, s15
+; SI-NEXT:    v_mov_b32_e32 v10, s22
+; SI-NEXT:    v_mov_b32_e32 v11, s23
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB47_4:
+; SI-NEXT:    ; implicit-def: $sgpr4
+; SI-NEXT:    ; implicit-def: $sgpr10
+; SI-NEXT:    ; implicit-def: $sgpr8
 ; SI-NEXT:    ; implicit-def: $sgpr6
-; SI-NEXT:    ; implicit-def: $vgpr1
-; SI-NEXT:    ; implicit-def: $vgpr2
-; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $sgpr7
 ; SI-NEXT:    ; implicit-def: $sgpr9
-; SI-NEXT:    ; implicit-def: $sgpr10
 ; SI-NEXT:    ; implicit-def: $sgpr11
-; SI-NEXT:    ; implicit-def: $sgpr8
-; SI-NEXT:    ; implicit-def: $sgpr12
-; SI-NEXT:    ; implicit-def: $sgpr13
 ; SI-NEXT:    ; implicit-def: $sgpr14
+; SI-NEXT:    ; implicit-def: $sgpr15
+; SI-NEXT:    ; implicit-def: $sgpr22
+; SI-NEXT:    ; implicit-def: $sgpr23
 ; SI-NEXT:    s_branch .LBB47_2
 ;
 ; VI-LABEL: bitcast_v6i16_to_v12i8_scalar:
@@ -12541,44 +12556,45 @@ define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i3
 ; SI-NEXT:    v_mul_f32_e64 v11, 1.0, s16
 ; SI-NEXT:    v_mul_f32_e64 v10, 1.0, s17
 ; SI-NEXT:    v_mul_f32_e64 v7, 1.0, s18
-; SI-NEXT:    v_mul_f32_e64 v6, 1.0, s19
+; SI-NEXT:    v_mul_f32_e64 v1, 1.0, s19
 ; SI-NEXT:    v_mul_f32_e64 v9, 1.0, s20
 ; SI-NEXT:    v_mul_f32_e64 v8, 1.0, s21
 ; SI-NEXT:    s_cbranch_scc0 .LBB53_4
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v11
-; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v10
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v10
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v7
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v9
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v8
 ; SI-NEXT:    s_cbranch_execnz .LBB53_3
 ; SI-NEXT:  .LBB53_2: ; %cmp.true
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v10
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v10
 ; SI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v11
-; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; SI-NEXT:    v_add_f32_e32 v4, 0x40c00000, v2
 ; SI-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
 ; SI-NEXT:    v_alignbit_b32 v0, v2, v0, 16
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v9
-; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT:    v_add_f32_e32 v9, 0x40c00000, v2
+; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v8
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; SI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v6
-; SI-NEXT:    v_alignbit_b32 v4, v5, v2, 16
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v7
-; SI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; SI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
 ; SI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; SI-NEXT:    v_alignbit_b32 v2, v3, v2, 16
-; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v4
+; SI-NEXT:    v_lshr_b64 v[6:7], v[1:2], 16
+; SI-NEXT:    v_alignbit_b32 v4, v5, v9, 16
 ; SI-NEXT:  .LBB53_3: ; %end
+; SI-NEXT:    v_mov_b32_e32 v1, v6
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB53_4:
 ; SI-NEXT:    ; implicit-def: $vgpr0
-; SI-NEXT:    ; implicit-def: $vgpr1
+; SI-NEXT:    ; implicit-def: $vgpr6
 ; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    ; implicit-def: $vgpr3
 ; SI-NEXT:    ; implicit-def: $vgpr4
@@ -13210,7 +13226,7 @@ define inreg <6 x i16> @bitcast_v6f16_to_v6i16_scalar(<6 x half> inreg %a, i32 i
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, s16
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, s17
+; SI-NEXT:    v_cvt_f16_f32_e32 v6, s17
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, s18
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, s19
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, s20
@@ -13220,32 +13236,33 @@ define inreg <6 x i16> @bitcast_v6f16_to_v6i16_scalar(<6 x half> inreg %a, i32 i
 ; SI-NEXT:  ; %bb.1: ; %cmp.false
 ; SI-NEXT:    s_cbranch_execnz .LBB57_3
 ; SI-NEXT:  .LBB57_2: ; %cmp.true
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v6
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; SI-NEXT:    v_add_f32_e32 v3, 0x38000000, v3
 ; SI-NEXT:    v_add_f32_e32 v1, 0x38000000, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; SI-NEXT:    v_add_f32_e32 v2, 0x38000000, v2
-; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_add_f32_e32 v5, 0x38000000, v5
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_add_f32_e32 v0, 0x38000000, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; SI-NEXT:    v_add_f32_e32 v4, 0x38000000, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v5
-; SI-NEXT:    v_or_b32_e32 v4, v4, v6
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v6
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 16, v5
+; SI-NEXT:    v_lshr_b64 v[6:7], v[1:2], 16
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
-; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
+; SI-NEXT:    v_or_b32_e32 v4, v4, v8
 ; SI-NEXT:  .LBB57_3: ; %end
+; SI-NEXT:    v_mov_b32_e32 v1, v6
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ; SI-NEXT:  .LBB57_4:
 ; SI-NEXT:    s_branch .LBB57_2
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index b69afb8e301bb..e27164c2d6d69 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -4610,50 +4610,48 @@ define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_lshr_b64 s[6:7], s[10:11], 30
+; GFX6-NEXT:    s_mov_b32 s0, s8
+; GFX6-NEXT:    s_and_b32 s8, s6, 0x7fff
+; GFX6-NEXT:    s_and_b32 s6, s4, 0x7fff
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
 ; GFX6-NEXT:    s_and_b32 s6, s10, 0x7fff
-; GFX6-NEXT:    s_and_b32 s7, s4, 0x7fff
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s7
-; GFX6-NEXT:    v_mov_b32_e32 v2, s4
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s6
+; GFX6-NEXT:    s_lshr_b64 s[6:7], s[4:5], 30
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GFX6-NEXT:    s_bfe_u32 s4, s4, 0xf000f
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s6
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v1
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s4
-; GFX6-NEXT:    s_bfe_u32 s7, s10, 0xf000f
-; GFX6-NEXT:    v_alignbit_b32 v2, s5, v2, 30
-; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s7
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v5
-; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
-; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX6-NEXT:    v_mad_f32 v3, -v4, v1, v3
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, v2
-; GFX6-NEXT:    v_mov_b32_e32 v0, s10
-; GFX6-NEXT:    v_alignbit_b32 v0, s11, v0, 30
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
-; GFX6-NEXT:    v_mul_f32_e32 v1, v6, v7
-; GFX6-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GFX6-NEXT:    v_mad_f32 v4, -v1, v5, v6
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v2
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v5
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v1, v0, v6
-; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v1
-; GFX6-NEXT:    v_mad_f32 v0, -v1, v2, v0
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v3
-; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v5, vcc
-; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v4
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s4
+; GFX6-NEXT:    s_mov_b32 s1, s9
+; GFX6-NEXT:    s_bfe_u32 s9, s10, 0xf000f
+; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
+; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s9
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
+; GFX6-NEXT:    s_and_b32 s5, s6, 0x7fff
+; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s5
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX6-NEXT:    v_mul_f32_e32 v0, v4, v5
+; GFX6-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX6-NEXT:    v_mad_f32 v4, -v0, v3, v4
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s8
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v1
+; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v3
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v0, vcc
+; GFX6-NEXT:    v_mul_f32_e32 v0, v5, v6
+; GFX6-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v0
+; GFX6-NEXT:    v_mad_f32 v0, -v0, v1, v5
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, v1
+; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
+; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v4, vcc
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
+; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
 ; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX6-NEXT:    s_mov_b32 s0, s8
-; GFX6-NEXT:    s_mov_b32 s1, s9
 ; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -4667,46 +4665,44 @@ define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x
 ; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    s_and_b32 s5, s6, 0x7fff
+; GFX9-NEXT:    s_lshr_b64 s[4:5], s[2:3], 30
+; GFX9-NEXT:    s_and_b32 s3, s6, 0x7fff
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX9-NEXT:    s_and_b32 s5, s2, 0x7fff
+; GFX9-NEXT:    s_bfe_u32 s8, s2, 0xf000f
+; GFX9-NEXT:    s_lshr_b64 s[2:3], s[6:7], 30
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s5
-; GFX9-NEXT:    s_and_b32 s4, s2, 0x7fff
-; GFX9-NEXT:    v_alignbit_b32 v0, s3, v0, 30
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
 ; GFX9-NEXT:    s_bfe_u32 s3, s6, 0xf000f
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s4
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s3
-; GFX9-NEXT:    s_bfe_u32 s2, s2, 0xf000f
-; GFX9-NEXT:    v_mov_b32_e32 v3, s6
-; GFX9-NEXT:    v_alignbit_b32 v3, s7, v3, 30
-; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s2
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v6
-; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
-; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX9-NEXT:    v_mad_f32 v4, -v5, v1, v4
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v3
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v1
-; GFX9-NEXT:    v_mul_f32_e32 v1, v7, v8
-; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_mad_f32 v5, -v1, v6, v7
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v3
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v1, v0, v7
-; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v1
-; GFX9-NEXT:    v_mad_f32 v0, -v1, v3, v0
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, v3
-; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v6, vcc
-; GFX9-NEXT:    v_and_b32_e32 v4, 0x7fff, v5
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s3
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s8
+; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
+; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v4
+; GFX9-NEXT:    s_and_b32 s2, s2, 0x7fff
+; GFX9-NEXT:    v_mad_f32 v1, -v3, v0, v1
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s2
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX9-NEXT:    v_mul_f32_e32 v0, v5, v6
+; GFX9-NEXT:    s_and_b32 s4, s4, 0x7fff
+; GFX9-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX9-NEXT:    v_mad_f32 v5, -v0, v4, v5
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s4
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v1
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v0, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v0, v6, v7
+; GFX9-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v0
+; GFX9-NEXT:    v_mad_f32 v0, -v0, v1, v6
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, v1
+; GFX9-NEXT:    v_and_b32_e32 v4, 0x7fff, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v5, vcc
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
+; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
 ; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
@@ -4797,58 +4793,56 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_lshr_b64 s[6:7], s[10:11], 30
 ; GFX6-NEXT:    s_mov_b32 s0, s8
-; GFX6-NEXT:    s_and_b32 s8, s4, 0x7fff
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s8
-; GFX6-NEXT:    s_and_b32 s7, s10, 0x7fff
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s7
-; GFX6-NEXT:    v_mov_b32_e32 v2, s4
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v1
-; GFX6-NEXT:    v_alignbit_b32 v2, s5, v2, 30
-; GFX6-NEXT:    s_bfe_u32 s5, s4, 0xf000f
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s5
-; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
-; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX6-NEXT:    v_mad_f32 v3, -v4, v1, v3
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
-; GFX6-NEXT:    s_bfe_u32 s8, s10, 0xf000f
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s8
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v4, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s4
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v5
-; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
-; GFX6-NEXT:    v_mov_b32_e32 v0, s10
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s10, v1
-; GFX6-NEXT:    v_mul_f32_e32 v1, v3, v4
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, v2
-; GFX6-NEXT:    v_alignbit_b32 v0, s11, v0, 30
-; GFX6-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v7, v0
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v4
-; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    v_mad_f32 v3, -v1, v5, v3
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
-; GFX6-NEXT:    v_mul_f32_e32 v3, v7, v8
-; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v3
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX6-NEXT:    v_mad_f32 v3, -v3, v4, v7
+; GFX6-NEXT:    s_and_b32 s8, s6, 0x7fff
+; GFX6-NEXT:    s_and_b32 s6, s4, 0x7fff
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX6-NEXT:    s_and_b32 s6, s10, 0x7fff
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s6
+; GFX6-NEXT:    s_lshr_b64 s[6:7], s[4:5], 30
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX6-NEXT:    s_and_b32 s5, s6, 0x7fff
+; GFX6-NEXT:    s_bfe_u32 s6, s4, 0xf000f
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s6
+; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
+; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
+; GFX6-NEXT:    s_bfe_u32 s11, s10, 0xf000f
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s11
+; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s4
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v3
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s8
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, 15
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s4
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, v3, v2
-; GFX6-NEXT:    s_lshr_b32 s6, s10, 15
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s6, v1
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
-; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
-; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
-; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s10, v0
+; GFX6-NEXT:    v_mul_f32_e32 v0, v1, v2
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s5
+; GFX6-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX6-NEXT:    v_mad_f32 v1, -v0, v3, v1
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v2
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v3
 ; GFX6-NEXT:    s_mov_b32 s1, s9
+; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v0, vcc
+; GFX6-NEXT:    v_mul_f32_e32 v1, v5, v6
+; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v1
+; GFX6-NEXT:    v_mad_f32 v1, -v1, v2, v5
+; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v2
+; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s4
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s5
+; GFX6-NEXT:    s_lshr_b32 s9, s10, 15
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s9, v0
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s8, v1
+; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
+; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
+; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 15, v2
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -4862,54 +4856,52 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x
 ; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_alignbit_b32 v0, s3, v0, 30
-; GFX9-NEXT:    s_and_b32 s3, s6, 0x7fff
+; GFX9-NEXT:    s_lshr_b64 s[4:5], s[2:3], 30
+; GFX9-NEXT:    s_and_b32 s5, s6, 0x7fff
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s5
+; GFX9-NEXT:    s_and_b32 s3, s2, 0x7fff
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s3
-; GFX9-NEXT:    s_and_b32 s4, s2, 0x7fff
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s4
-; GFX9-NEXT:    s_bfe_u32 s4, s6, 0xf000f
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s4
-; GFX9-NEXT:    v_mov_b32_e32 v3, s6
-; GFX9-NEXT:    v_alignbit_b32 v3, s7, v3, 30
-; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
-; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX9-NEXT:    v_mad_f32 v4, -v5, v1, v4
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GFX9-NEXT:    s_bfe_u32 s5, s2, 0xf000f
-; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v1
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s5
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, v3
-; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX9-NEXT:    v_mul_f32_e32 v4, v7, v8
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, v0
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v9, v5
+; GFX9-NEXT:    s_and_b32 s8, s4, 0x7fff
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GFX9-NEXT:    s_lshr_b64 s[4:5], s[6:7], 30
+; GFX9-NEXT:    s_bfe_u32 s5, s6, 0xf000f
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s5
+; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
+; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX9-NEXT:    v_mad_f32 v1, -v3, v0, v1
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX9-NEXT:    s_bfe_u32 s9, s2, 0xf000f
+; GFX9-NEXT:    s_and_b32 s3, s4, 0x7fff
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s9
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s3
+; GFX9-NEXT:    v_mul_f32_e32 v1, v5, v6
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s8
+; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v3
+; GFX9-NEXT:    v_mad_f32 v5, -v1, v4, v5
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v4
+; GFX9-NEXT:    v_mul_f32_e32 v4, v6, v7
 ; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX9-NEXT:    v_mad_f32 v7, -v4, v6, v7
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, v6
-; GFX9-NEXT:    v_mul_f32_e32 v6, v8, v9
-; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_mad_f32 v6, -v6, v5, v8
-; GFX9-NEXT:    s_lshr_b32 s3, s6, 15
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, v5
-; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s3
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s6
-; GFX9-NEXT:    v_mul_lo_u32 v3, v5, v3
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_mad_f32 v4, -v4, v3, v6
+; GFX9-NEXT:    s_lshr_b32 s4, s6, 15
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v3
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s6
+; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s4
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s3
 ; GFX9-NEXT:    s_lshr_b32 s3, s2, 15
-; GFX9-NEXT:    v_sub_u32_e32 v4, s3, v4
-; GFX9-NEXT:    v_sub_u32_e32 v5, s2, v1
-; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
-; GFX9-NEXT:    v_and_b32_e32 v4, 0x7fff, v4
+; GFX9-NEXT:    v_sub_u32_e32 v4, s2, v0
+; GFX9-NEXT:    v_sub_u32_e32 v5, s3, v1
+; GFX9-NEXT:    v_sub_u32_e32 v0, s8, v3
+; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v4
+; GFX9-NEXT:    v_and_b32_e32 v4, 0x7fff, v5
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
-; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v5
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
 ; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
@@ -5006,64 +4998,63 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mov_b32_e32 v0, s10
-; GFX6-NEXT:    s_bfe_i32 s6, s4, 0xf0000
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s6
-; GFX6-NEXT:    v_mov_b32_e32 v1, s4
-; GFX6-NEXT:    v_alignbit_b32 v1, s5, v1, 30
+; GFX6-NEXT:    s_lshr_b64 s[6:7], s[10:11], 30
+; GFX6-NEXT:    s_bfe_i32 s7, s4, 0xf0000
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s7
+; GFX6-NEXT:    s_mov_b32 s0, s8
+; GFX6-NEXT:    s_mov_b32 s1, s9
+; GFX6-NEXT:    s_lshr_b64 s[8:9], s[4:5], 30
 ; GFX6-NEXT:    s_bfe_i32 s5, s10, 0xf0000
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s5
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
-; GFX6-NEXT:    s_xor_b32 s5, s5, s6
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s5
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX6-NEXT:    s_xor_b32 s5, s5, s7
 ; GFX6-NEXT:    s_ashr_i32 s5, s5, 30
 ; GFX6-NEXT:    s_or_b32 s5, s5, 1
-; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
-; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX6-NEXT:    v_mad_f32 v3, -v4, v2, v3
-; GFX6-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v3|, |v2|
-; GFX6-NEXT:    s_and_b64 s[6:7], s[6:7], exec
-; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
+; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
+; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[12:13], |v1|, |v0|
+; GFX6-NEXT:    s_and_b64 s[12:13], s[12:13], exec
+; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GFX6-NEXT:    s_cselect_b32 s5, s5, 0
 ; GFX6-NEXT:    s_bfe_i32 s4, s4, 0xf000f
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, s5, v4
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s4
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, s5, v2
 ; GFX6-NEXT:    s_bfe_i32 s5, s10, 0xf000f
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s5
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s5
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v0
 ; GFX6-NEXT:    s_xor_b32 s4, s5, s4
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
-; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 15
-; GFX6-NEXT:    v_mul_f32_e32 v5, v4, v5
-; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX6-NEXT:    v_mad_f32 v4, -v5, v2, v4
+; GFX6-NEXT:    s_or_b32 s7, s4, 1
+; GFX6-NEXT:    v_mul_f32_e32 v3, v1, v3
+; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX6-NEXT:    v_mad_f32 v1, -v3, v0, v1
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
+; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX6-NEXT:    s_cselect_b32 s4, s7, 0
+; GFX6-NEXT:    s_bfe_i32 s5, s8, 0xf0000
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s5
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, s4, v3
+; GFX6-NEXT:    s_bfe_i32 s4, s6, 0xf0000
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s4
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v0
+; GFX6-NEXT:    s_xor_b32 s4, s4, s5
+; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
 ; GFX6-NEXT:    s_or_b32 s6, s4, 1
-; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
-; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v4|, |v2|
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, v1
-; GFX6-NEXT:    v_alignbit_b32 v0, s11, v0, 30
+; GFX6-NEXT:    v_mul_f32_e32 v4, v1, v4
+; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
+; GFX6-NEXT:    v_mad_f32 v1, -v4, v0, v1
+; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
 ; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
 ; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
-; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 15
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, s4, v5
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v5, v0
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v2
-; GFX6-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
-; GFX6-NEXT:    v_or_b32_e32 v0, 1, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, v5, v6
-; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX6-NEXT:    v_mad_f32 v5, -v1, v2, v5
-; GFX6-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, |v2|
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v3
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v4
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v4
+; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
+; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
 ; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
-; GFX6-NEXT:    s_mov_b32 s0, s8
-; GFX6-NEXT:    s_mov_b32 s1, s9
 ; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -5077,60 +5068,59 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x
 ; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    s_bfe_i32 s4, s6, 0xf0000
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s4
-; GFX9-NEXT:    v_alignbit_b32 v0, s3, v0, 30
-; GFX9-NEXT:    s_bfe_i32 s3, s2, 0xf0000
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s3
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v3
-; GFX9-NEXT:    s_xor_b32 s3, s3, s4
+; GFX9-NEXT:    s_lshr_b64 s[4:5], s[2:3], 30
+; GFX9-NEXT:    s_bfe_i32 s3, s6, 0xf0000
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s3
+; GFX9-NEXT:    s_bfe_i32 s5, s2, 0xf0000
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s5
+; GFX9-NEXT:    s_xor_b32 s3, s5, s3
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
 ; GFX9-NEXT:    s_ashr_i32 s3, s3, 30
+; GFX9-NEXT:    s_lshr_b64 s[8:9], s[6:7], 30
 ; GFX9-NEXT:    s_or_b32 s3, s3, 1
-; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
-; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX9-NEXT:    v_mad_f32 v4, -v5, v3, v4
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v4|, |v3|
-; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
+; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX9-NEXT:    v_mad_f32 v1, -v3, v0, v1
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[10:11], |v1|, |v0|
+; GFX9-NEXT:    s_and_b64 s[10:11], s[10:11], exec
 ; GFX9-NEXT:    s_cselect_b32 s3, s3, 0
-; GFX9-NEXT:    s_bfe_i32 s4, s6, 0xf000f
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s4
+; GFX9-NEXT:    s_bfe_i32 s5, s6, 0xf000f
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s5
 ; GFX9-NEXT:    s_bfe_i32 s2, s2, 0xf000f
-; GFX9-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NEXT:    v_add_u32_e32 v4, s3, v5
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s2
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v3
-; GFX9-NEXT:    v_alignbit_b32 v1, s7, v1, 30
-; GFX9-NEXT:    s_xor_b32 s2, s2, s4
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s2
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
+; GFX9-NEXT:    s_xor_b32 s2, s2, s5
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX9-NEXT:    v_add_u32_e32 v3, s3, v3
+; GFX9-NEXT:    v_mul_f32_e32 v4, v1, v4
+; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
+; GFX9-NEXT:    v_mad_f32 v1, -v4, v0, v1
+; GFX9-NEXT:    s_or_b32 s5, s2, 1
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
+; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
+; GFX9-NEXT:    s_cselect_b32 s2, s5, 0
+; GFX9-NEXT:    s_bfe_i32 s3, s8, 0xf0000
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s3
+; GFX9-NEXT:    v_add_u32_e32 v4, s2, v4
+; GFX9-NEXT:    s_bfe_i32 s2, s4, 0xf0000
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s2
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v0
+; GFX9-NEXT:    s_xor_b32 s2, s2, s3
 ; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
-; GFX9-NEXT:    v_mul_f32_e32 v6, v5, v6
-; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
-; GFX9-NEXT:    v_mad_f32 v5, -v6, v3, v5
-; GFX9-NEXT:    v_bfe_i32 v1, v1, 0, 15
 ; GFX9-NEXT:    s_or_b32 s4, s2, 1
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v6, v6
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v5|, |v3|
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, v1
+; GFX9-NEXT:    v_mul_f32_e32 v5, v1, v5
+; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX9-NEXT:    v_mad_f32 v1, -v5, v0, v1
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
 ; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], exec
 ; GFX9-NEXT:    s_cselect_b32 s2, s4, 0
-; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 15
-; GFX9-NEXT:    v_add_u32_e32 v5, s2, v6
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v6, v0
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v3
-; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v1
-; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
-; GFX9-NEXT:    v_or_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_mul_f32_e32 v1, v6, v7
-; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v7, v1
-; GFX9-NEXT:    v_mad_f32 v1, -v1, v3, v6
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v3|
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX9-NEXT:    v_add_u32_e32 v0, v7, v0
-; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v4
-; GFX9-NEXT:    v_and_b32_e32 v4, 0x7fff, v5
+; GFX9-NEXT:    v_add_u32_e32 v0, s2, v5
+; GFX9-NEXT:    v_and_b32_e32 v4, 0x7fff, v4
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
+; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
 ; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
@@ -5233,74 +5223,73 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_bfe_i32 s6, s10, 0xf0000
-; GFX6-NEXT:    v_mov_b32_e32 v2, s4
-; GFX6-NEXT:    v_alignbit_b32 v2, s5, v2, 30
-; GFX6-NEXT:    s_bfe_i32 s5, s4, 0xf0000
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s5
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v5, s6
-; GFX6-NEXT:    s_xor_b32 s5, s6, s5
-; GFX6-NEXT:    s_ashr_i32 s5, s5, 30
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
 ; GFX6-NEXT:    s_mov_b32 s0, s8
 ; GFX6-NEXT:    s_mov_b32 s1, s9
-; GFX6-NEXT:    s_lshr_b32 s8, s10, 15
-; GFX6-NEXT:    v_mul_f32_e32 v6, v5, v6
-; GFX6-NEXT:    v_trunc_f32_e32 v6, v6
-; GFX6-NEXT:    v_mad_f32 v5, -v6, v4, v5
-; GFX6-NEXT:    v_cvt_i32_f32_e32 v6, v6
-; GFX6-NEXT:    s_lshr_b32 s9, s4, 15
+; GFX6-NEXT:    s_lshr_b64 s[8:9], s[4:5], 30
+; GFX6-NEXT:    s_bfe_i32 s5, s4, 0xf0000
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s5
+; GFX6-NEXT:    s_bfe_i32 s12, s10, 0xf0000
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s12
+; GFX6-NEXT:    s_xor_b32 s5, s12, s5
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX6-NEXT:    s_lshr_b64 s[6:7], s[10:11], 30
+; GFX6-NEXT:    s_ashr_i32 s5, s5, 30
+; GFX6-NEXT:    s_and_b32 s7, s6, 0x7fff
+; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
+; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
+; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GFX6-NEXT:    s_lshr_b32 s11, s10, 15
+; GFX6-NEXT:    s_and_b32 s9, s8, 0x7fff
+; GFX6-NEXT:    s_lshr_b32 s14, s4, 15
 ; GFX6-NEXT:    s_or_b32 s5, s5, 1
-; GFX6-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v5|, |v4|
-; GFX6-NEXT:    s_and_b64 s[6:7], s[6:7], exec
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[12:13], |v1|, |v0|
+; GFX6-NEXT:    s_and_b64 s[12:13], s[12:13], exec
 ; GFX6-NEXT:    s_cselect_b32 s5, s5, 0
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, s5, v6
-; GFX6-NEXT:    v_mul_lo_u32 v4, v4, s4
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s5, v2
+; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s4
 ; GFX6-NEXT:    s_bfe_i32 s4, s4, 0xf000f
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v5, s4
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s4
 ; GFX6-NEXT:    s_bfe_i32 s5, s10, 0xf000f
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v6, s5
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s10, v0
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s5
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v1
 ; GFX6-NEXT:    s_xor_b32 s4, s5, s4
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v5
-; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v2
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
-; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 15
-; GFX6-NEXT:    v_mul_f32_e32 v7, v6, v7
-; GFX6-NEXT:    v_trunc_f32_e32 v7, v7
-; GFX6-NEXT:    v_mad_f32 v6, -v7, v5, v6
+; GFX6-NEXT:    s_or_b32 s10, s4, 1
+; GFX6-NEXT:    v_mul_f32_e32 v3, v0, v3
+; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX6-NEXT:    v_mad_f32 v0, -v3, v1, v0
+; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v0|, |v1|
+; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX6-NEXT:    s_cselect_b32 s4, s10, 0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v3
+; GFX6-NEXT:    s_bfe_i32 s4, s8, 0xf0000
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s4
+; GFX6-NEXT:    s_bfe_i32 s5, s6, 0xf0000
+; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s5
+; GFX6-NEXT:    s_xor_b32 s4, s5, s4
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v1
+; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
 ; GFX6-NEXT:    s_or_b32 s6, s4, 1
-; GFX6-NEXT:    v_cvt_i32_f32_e32 v7, v7
-; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v6|, |v5|
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v6, v2
-; GFX6-NEXT:    v_mov_b32_e32 v0, s10
-; GFX6-NEXT:    v_alignbit_b32 v0, s11, v0, 30
+; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s14
+; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
+; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
+; GFX6-NEXT:    v_mad_f32 v3, -v4, v1, v3
+; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
+; GFX6-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, |v1|
 ; GFX6-NEXT:    s_and_b64 s[4:5], s[4:5], exec
-; GFX6-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX6-NEXT:    s_cselect_b32 s4, s6, 0
-; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 15
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, s4, v7
-; GFX6-NEXT:    v_cvt_f32_i32_e32 v7, v0
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v6
-; GFX6-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s10, v4
-; GFX6-NEXT:    v_mul_f32_e32 v2, v7, v8
-; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
-; GFX6-NEXT:    v_mad_f32 v7, -v2, v6, v7
-; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
-; GFX6-NEXT:    v_or_b32_e32 v0, 1, v0
-; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, |v6|
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v5, v5, s9
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, v3
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s8, v5
-; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s4, v4
+; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s9
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s11, v0
+; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s7, v1
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
-; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 15, v2
-; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
+; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
+; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -5312,78 +5301,77 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_alignbit_b32 v0, s3, v0, 30
-; GFX9-NEXT:    s_bfe_i32 s3, s6, 0xf0000
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s3
-; GFX9-NEXT:    s_bfe_i32 s4, s2, 0xf0000
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s4
-; GFX9-NEXT:    s_xor_b32 s3, s4, s3
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NEXT:    s_ashr_i32 s3, s3, 30
-; GFX9-NEXT:    s_lshr_b32 s8, s2, 15
-; GFX9-NEXT:    v_mul_f32_e32 v6, v5, v6
-; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
-; GFX9-NEXT:    v_mad_f32 v5, -v6, v4, v5
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v6, v6
-; GFX9-NEXT:    v_alignbit_b32 v1, s7, v1, 30
-; GFX9-NEXT:    s_lshr_b32 s7, s6, 15
-; GFX9-NEXT:    s_or_b32 s3, s3, 1
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v5|, |v4|
-; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
-; GFX9-NEXT:    s_cselect_b32 s3, s3, 0
-; GFX9-NEXT:    v_add_u32_e32 v4, s3, v6
-; GFX9-NEXT:    s_bfe_i32 s3, s6, 0xf000f
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s3
-; GFX9-NEXT:    s_bfe_i32 s4, s2, 0xf000f
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v6, s4
-; GFX9-NEXT:    s_xor_b32 s3, s4, s3
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v5
-; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v1
-; GFX9-NEXT:    s_ashr_i32 s3, s3, 30
-; GFX9-NEXT:    v_bfe_i32 v1, v1, 0, 15
-; GFX9-NEXT:    v_mul_f32_e32 v7, v6, v7
-; GFX9-NEXT:    v_trunc_f32_e32 v7, v7
-; GFX9-NEXT:    v_mad_f32 v6, -v7, v5, v6
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v7, v7
-; GFX9-NEXT:    s_or_b32 s3, s3, 1
-; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v6|, |v5|
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v6, v1
+; GFX9-NEXT:    s_lshr_b64 s[4:5], s[2:3], 30
+; GFX9-NEXT:    s_bfe_i32 s5, s6, 0xf0000
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s5
+; GFX9-NEXT:    s_lshr_b64 s[8:9], s[6:7], 30
+; GFX9-NEXT:    s_bfe_i32 s7, s2, 0xf0000
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s7
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX9-NEXT:    s_xor_b32 s5, s7, s5
+; GFX9-NEXT:    s_ashr_i32 s5, s5, 30
+; GFX9-NEXT:    s_lshr_b32 s3, s2, 15
+; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
+; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GFX9-NEXT:    s_and_b32 s9, s4, 0x7fff
+; GFX9-NEXT:    s_and_b32 s12, s8, 0x7fff
+; GFX9-NEXT:    s_lshr_b32 s13, s6, 15
+; GFX9-NEXT:    s_or_b32 s5, s5, 1
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[10:11], |v1|, |v0|
+; GFX9-NEXT:    s_and_b64 s[10:11], s[10:11], exec
+; GFX9-NEXT:    s_cselect_b32 s5, s5, 0
+; GFX9-NEXT:    v_add_u32_e32 v0, s5, v2
+; GFX9-NEXT:    s_bfe_i32 s5, s6, 0xf000f
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s5
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s6
+; GFX9-NEXT:    s_bfe_i32 s6, s2, 0xf000f
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s6
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v1
+; GFX9-NEXT:    s_xor_b32 s5, s6, s5
+; GFX9-NEXT:    s_ashr_i32 s5, s5, 30
+; GFX9-NEXT:    s_or_b32 s5, s5, 1
+; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
+; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX9-NEXT:    v_mad_f32 v2, -v3, v1, v2
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[6:7], |v2|, |v1|
+; GFX9-NEXT:    s_and_b64 s[6:7], s[6:7], exec
+; GFX9-NEXT:    s_cselect_b32 s5, s5, 0
+; GFX9-NEXT:    v_add_u32_e32 v1, s5, v3
+; GFX9-NEXT:    s_bfe_i32 s5, s8, 0xf0000
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s5
+; GFX9-NEXT:    s_bfe_i32 s4, s4, 0xf0000
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s4
+; GFX9-NEXT:    s_xor_b32 s4, s4, s5
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v2
+; GFX9-NEXT:    s_ashr_i32 s4, s4, 30
+; GFX9-NEXT:    s_or_b32 s6, s4, 1
+; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s13
+; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v4
+; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
+; GFX9-NEXT:    v_mad_f32 v3, -v4, v2, v3
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, |v2|
 ; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], exec
-; GFX9-NEXT:    s_cselect_b32 s3, s3, 0
-; GFX9-NEXT:    v_add_u32_e32 v5, s3, v7
-; GFX9-NEXT:    v_bfe_i32 v7, v0, 0, 15
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v8, v7
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v9, v6
-; GFX9-NEXT:    v_xor_b32_e32 v1, v7, v1
-; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 30, v1
-; GFX9-NEXT:    v_or_b32_e32 v1, 1, v1
-; GFX9-NEXT:    v_mul_f32_e32 v7, v8, v9
-; GFX9-NEXT:    v_trunc_f32_e32 v7, v7
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v9, v7
-; GFX9-NEXT:    v_mad_f32 v7, -v7, v6, v8
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, |v6|
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s6
-; GFX9-NEXT:    v_mul_lo_u32 v5, v5, s7
-; GFX9-NEXT:    v_add_u32_e32 v1, v9, v1
-; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v3
-; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX9-NEXT:    v_sub_u32_e32 v3, s2, v4
-; GFX9-NEXT:    v_sub_u32_e32 v4, s8, v5
-; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v1
-; GFX9-NEXT:    v_and_b32_e32 v4, 0x7fff, v4
+; GFX9-NEXT:    s_cselect_b32 s4, s6, 0
+; GFX9-NEXT:    v_add_u32_e32 v2, s4, v4
+; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s12
+; GFX9-NEXT:    v_sub_u32_e32 v4, s2, v0
+; GFX9-NEXT:    v_sub_u32_e32 v5, s3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_sub_u32_e32 v0, s9, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff, v4
+; GFX9-NEXT:    v_and_b32_e32 v4, 0x7fff, v5
 ; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
-; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
-; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
-; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
-; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX9-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX9-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
-; GFX9-NEXT:    global_store_short v2, v0, s[0:1] offset:4
+; GFX9-NEXT:    global_store_short v3, v0, s[0:1] offset:4
 ; GFX9-NEXT:    s_endpgm
   %r = srem <3 x i15> %x, %y
   store <3 x i15> %r, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index 12f8a59f0b84b..d89b39348ad9a 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -17,50 +17,50 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   early-clobber renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM_ec renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4)
   ; GFX90A-NEXT:   renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset.align.down + 16, align 8, addrspace 4)
   ; GFX90A-NEXT:   renamable $sgpr24_sgpr25_sgpr26_sgpr27 = S_LOAD_DWORDX4_IMM renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
-  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_LOAD_DWORDX2_IMM renamable $sgpr8_sgpr9, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4)
+  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_LOAD_DWORDX2_IMM renamable $sgpr8_sgpr9, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4)
   ; GFX90A-NEXT:   S_BITCMP1_B32 renamable $sgpr17, 0, implicit-def $scc
   ; GFX90A-NEXT:   renamable $sgpr12_sgpr13 = S_CSELECT_B64 -1, 0, implicit killed $scc
-  ; GFX90A-NEXT:   renamable $sgpr34_sgpr35 = S_MOV_B64 -1
+  ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_MOV_B64 -1
   ; GFX90A-NEXT:   renamable $sgpr28_sgpr29 = S_XOR_B64 renamable $sgpr12_sgpr13, -1, implicit-def dead $scc
   ; GFX90A-NEXT:   S_BITCMP1_B32 renamable $sgpr17, 8, implicit-def $scc
-  ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_CSELECT_B64 -1, 0, implicit killed $scc
-  ; GFX90A-NEXT:   renamable $sgpr30_sgpr31 = S_XOR_B64 killed renamable $sgpr18_sgpr19, -1, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr30_sgpr31 = S_CSELECT_B64 -1, 0, implicit killed $scc
+  ; GFX90A-NEXT:   renamable $sgpr30_sgpr31 = S_XOR_B64 killed renamable $sgpr30_sgpr31, -1, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $vgpr5 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr4 = DS_READ_B32_gfx9 renamable $vgpr5, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) null`, align 8, addrspace 3)
-  ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, renamable $sgpr28_sgpr29, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_VCCZ %bb.2, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.1.bb103:
   ; GFX90A-NEXT:   successors: %bb.58(0x40000000), %bb.2(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr4_vgpr5:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr4_vgpr5:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr34_sgpr35 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, renamable $sgpr30_sgpr31, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.58, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.2:
   ; GFX90A-NEXT:   successors: %bb.3(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr46, $sgpr47, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr4, $vgpr5
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr40_sgpr41, $sgpr56, $sgpr57, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr4, $vgpr5
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
+  ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF implicit-def $vgpr14
   ; GFX90A-NEXT:   renamable $vgpr3 = IMPLICIT_DEF implicit-def $vgpr2
-  ; GFX90A-NEXT:   renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20
-  ; GFX90A-NEXT:   renamable $vgpr23 = IMPLICIT_DEF implicit-def $vgpr22
   ; GFX90A-NEXT:   renamable $vgpr25 = IMPLICIT_DEF implicit-def $vgpr24
+  ; GFX90A-NEXT:   renamable $vgpr27 = IMPLICIT_DEF implicit-def $vgpr26
+  ; GFX90A-NEXT:   renamable $vgpr29 = IMPLICIT_DEF implicit-def $vgpr28
   ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_MOV_B64 0
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.3.Flow17:
   ; GFX90A-NEXT:   successors: %bb.4(0x40000000), %bb.57(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr30 = V_AND_B32_e32 1023, $vgpr31, implicit $exec
-  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr34_sgpr35, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_VCCZ %bb.57, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.4.bb15:
   ; GFX90A-NEXT:   successors: %bb.35(0x40000000), %bb.5(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr4_vgpr5:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr4_vgpr5:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 2, $vgpr4_vgpr5, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2 = COPY renamable $sgpr25, implicit $exec
@@ -75,12 +75,12 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.5:
   ; GFX90A-NEXT:   successors: %bb.6(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr34_sgpr35 = S_MOV_B64 -1
+  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_MOV_B64 0
@@ -98,47 +98,49 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr56_vgpr57 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr44_vgpr45 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr42_vgpr43 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr20 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr18 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF implicit-def $vgpr14
-  ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
+  ; GFX90A-NEXT:   renamable $vgpr22 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr12 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18
+  ; GFX90A-NEXT:   renamable $sgpr18 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.6.Flow20:
   ; GFX90A-NEXT:   successors: %bb.7(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $vgpr20 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr22 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr24 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr25 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr26 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr28 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr29 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr15 = COPY killed renamable $sgpr18, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr3 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr21 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr23 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr25 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr27 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.7.Flow19:
   ; GFX90A-NEXT:   successors: %bb.62(0x40000000), %bb.8(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_MOV_B64 0
-  ; GFX90A-NEXT:   $sgpr24_sgpr25 = S_AND_SAVEEXEC_B64 $sgpr36_sgpr37, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   $sgpr18_sgpr19 = S_AND_SAVEEXEC_B64 $sgpr36_sgpr37, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.62, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.8.Flow32:
   ; GFX90A-NEXT:   successors: %bb.9(0x40000000), %bb.10(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def $scc
-  ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr18_sgpr19, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def $scc
+  ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr40_sgpr41, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr12_sgpr13 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.10, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.9.bb89:
   ; GFX90A-NEXT:   successors: %bb.10(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
@@ -146,16 +148,16 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.10.Flow33:
   ; GFX90A-NEXT:   successors: %bb.11(0x40000000), %bb.12(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc
-  ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr44_sgpr45, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr46_sgpr47, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr12_sgpr13 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.12, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.11.bb84:
   ; GFX90A-NEXT:   successors: %bb.12(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
@@ -163,16 +165,16 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.12.Flow34:
   ; GFX90A-NEXT:   successors: %bb.13(0x40000000), %bb.14(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc
-  ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr42_sgpr43, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr44_sgpr45, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr12_sgpr13 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.14, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.13.bb79:
   ; GFX90A-NEXT:   successors: %bb.14(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET renamable $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
@@ -180,10 +182,10 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.14.Flow35:
   ; GFX90A-NEXT:   successors: %bb.15(0x40000000), %bb.16(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc
-  ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr40_sgpr41, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr42_sgpr43, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_EXECZ %bb.16, implicit $exec
   ; GFX90A-NEXT: {{  $}}
@@ -357,15 +359,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.35.bb20:
   ; GFX90A-NEXT:   successors: %bb.37(0x40000000), %bb.36(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0 = FLAT_LOAD_SBYTE renamable $vgpr40_vgpr41, 1024, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i23)
   ; GFX90A-NEXT:   renamable $vgpr42 = V_ADD_CO_U32_e32 1024, $vgpr40, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr34_sgpr35 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr38_sgpr39 = S_MOV_B64 -1
+  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_MOV_B64 0
@@ -383,33 +385,33 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr58_vgpr59 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr56_vgpr57 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr44_vgpr45 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr20 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr18 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF implicit-def $vgpr14
-  ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
+  ; GFX90A-NEXT:   renamable $vgpr22 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr12 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18
+  ; GFX90A-NEXT:   renamable $sgpr18 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
   ; GFX90A-NEXT:   $sgpr24_sgpr25 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.37, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.36.Flow21:
   ; GFX90A-NEXT:   successors: %bb.6(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.6
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.37.bb27:
   ; GFX90A-NEXT:   successors: %bb.39(0x40000000), %bb.38(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr44_sgpr45, $sgpr42_sgpr43, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr64_sgpr65, $sgpr50_sgpr51, $sgpr66_sgpr67
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr64_sgpr65, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr66_sgpr67, $sgpr48_sgpr49
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0 = FLAT_LOAD_UBYTE renamable $vgpr40_vgpr41, 2048, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i30)
   ; GFX90A-NEXT:   renamable $vgpr44 = V_ADD_CO_U32_e32 2048, $vgpr40, implicit-def $vcc, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = COPY renamable $sgpr36_sgpr37
-  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_MOV_B64 -1
+  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = COPY renamable $sgpr36_sgpr37
   ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr45, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
@@ -422,49 +424,51 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr60_vgpr61 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr58_vgpr59 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr56_vgpr57 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr20 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr18 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF implicit-def $vgpr14
-  ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
+  ; GFX90A-NEXT:   renamable $vgpr22 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr12 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18
+  ; GFX90A-NEXT:   renamable $sgpr18 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
   ; GFX90A-NEXT:   $sgpr38_sgpr39 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.39, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.38.Flow22:
   ; GFX90A-NEXT:   successors: %bb.36(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr38_sgpr39, implicit-def $scc
   ; GFX90A-NEXT:   renamable $sgpr38_sgpr39 = S_XOR_B64 $exec, -1, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr18_sgpr19, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_ANDN2_B64 killed renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_OR_B64 killed renamable $sgpr36_sgpr37, killed renamable $sgpr46_sgpr47, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_OR_B64 killed renamable $sgpr36_sgpr37, killed renamable $sgpr56_sgpr57, implicit-def dead $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.36
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.39.bb34:
   ; GFX90A-NEXT:   successors: %bb.41(0x40000000), %bb.40(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr44_sgpr45, $sgpr50_sgpr51, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr54_sgpr55, $sgpr62_sgpr63, $sgpr52_sgpr53, $sgpr64_sgpr65, $sgpr66_sgpr67
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr66_sgpr67
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0 = FLAT_LOAD_UBYTE renamable $vgpr40_vgpr41, 3072, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i37)
   ; GFX90A-NEXT:   renamable $vgpr56 = V_ADD_CO_U32_e32 3072, $vgpr40, implicit-def $vcc, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = COPY renamable $sgpr36_sgpr37
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_MOV_B64 -1
+  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37
+  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr57, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec
+  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr10_vgpr11 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr6_vgpr7 = IMPLICIT_DEF
@@ -472,48 +476,48 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr62_vgpr63 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr60_vgpr61 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr58_vgpr59 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr20 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr18 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF implicit-def $vgpr14
-  ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
+  ; GFX90A-NEXT:   renamable $vgpr22 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr12 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18
+  ; GFX90A-NEXT:   renamable $sgpr18 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
   ; GFX90A-NEXT:   $sgpr40_sgpr41 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.41, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.40.Flow23:
   ; GFX90A-NEXT:   successors: %bb.38(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr68_sgpr69, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr40_sgpr41, implicit-def $scc
-  ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_XOR_B64 $exec, -1, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_XOR_B64 $exec, -1, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr18_sgpr19, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr46_sgpr47, killed renamable $sgpr48_sgpr49, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr48_sgpr49, killed renamable $sgpr50_sgpr51, implicit-def dead $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.38
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.41.bb41:
   ; GFX90A-NEXT:   successors: %bb.46(0x40000000), %bb.42(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr54_sgpr55, $sgpr66_sgpr67, $sgpr68_sgpr69
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr58 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = COPY $vcc
   ; GFX90A-NEXT:   renamable $vgpr59, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr18_sgpr19, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr0 = FLAT_LOAD_UBYTE renamable $vgpr58_vgpr59, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i44)
-  ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = COPY renamable $sgpr36_sgpr37
+  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_MOV_B64 -1
+  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37
   ; GFX90A-NEXT:   renamable $vgpr3, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr10_vgpr11 = IMPLICIT_DEF
@@ -522,271 +526,273 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr0_vgpr1 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr62_vgpr63 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr60_vgpr61 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr20 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr18 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF implicit-def $vgpr14
-  ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
+  ; GFX90A-NEXT:   renamable $vgpr22 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr12 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18
+  ; GFX90A-NEXT:   renamable $sgpr18 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
   ; GFX90A-NEXT:   $sgpr42_sgpr43 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.46, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.42.Flow24:
   ; GFX90A-NEXT:   successors: %bb.40(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr42_sgpr43, implicit-def $scc
   ; GFX90A-NEXT:   renamable $vgpr59 = COPY killed renamable $vgpr3, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_XOR_B64 $exec, -1, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_XOR_B64 $exec, -1, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr18_sgpr19, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_OR_B64 killed renamable $sgpr46_sgpr47, killed renamable $sgpr48_sgpr49, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_OR_B64 killed renamable $sgpr42_sgpr43, killed renamable $sgpr50_sgpr51, implicit-def dead $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.40
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.43.bb55:
   ; GFX90A-NEXT:   successors: %bb.48(0x40000000), %bb.44(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59, $sgpr48_sgpr49
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   S_BITCMP1_B32 killed renamable $sgpr17, 16, implicit-def $scc
-  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_CSELECT_B64 -1, 0, implicit killed $scc
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_XOR_B64 renamable $sgpr64_sgpr65, -1, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_CSELECT_B64 -1, 0, implicit killed $scc
+  ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_XOR_B64 renamable $sgpr66_sgpr67, -1, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $vgpr62 = V_ADD_CO_U32_e32 6144, $vgpr40, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr63, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, renamable $sgpr50_sgpr51, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, renamable $sgpr18_sgpr19, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.48, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.44:
   ; GFX90A-NEXT:   successors: %bb.45(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr57, $vgpr56, $vgpr30, $vgpr31, $vgpr60, $vgpr62, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $vgpr61, $vgpr58, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr47, $vgpr46, $vgpr2, $vgpr4, $vgpr5, $vgpr45, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr40, $vgpr63
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr62, $vgpr56, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $vgpr57, $vgpr61, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr47, $vgpr46, $vgpr2, $vgpr4, $vgpr5, $vgpr45, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr40, $vgpr60, $vgpr63, $vgpr58
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37
   ; GFX90A-NEXT:   renamable $vgpr10_vgpr11 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr6_vgpr7 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr0_vgpr1 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr20 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr18 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF implicit-def $vgpr14
-  ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
+  ; GFX90A-NEXT:   renamable $vgpr22 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr12 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18
+  ; GFX90A-NEXT:   renamable $sgpr18 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_MOV_B64 0
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.45.Flow26:
   ; GFX90A-NEXT:   successors: %bb.47(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_XOR_B64 $exec, -1, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr70_sgpr71 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_XOR_B64 $exec, -1, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr70_sgpr71 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr46_sgpr47, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_OR_B64 killed renamable $sgpr46_sgpr47, killed renamable $sgpr50_sgpr51, implicit-def dead $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.47
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.46.bb48:
   ; GFX90A-NEXT:   successors: %bb.43(0x40000000), %bb.47(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr64_sgpr65, $sgpr50_sgpr51, $sgpr66_sgpr67, $sgpr44_sgpr45, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr66_sgpr67, $sgpr58_sgpr59, $sgpr68_sgpr69, $sgpr64_sgpr65, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr60_sgpr61
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr60 = V_ADD_CO_U32_e32 5120, $vgpr40, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = COPY $vcc
   ; GFX90A-NEXT:   renamable $vgpr0 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr1, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr0 = FLAT_LOAD_UBYTE killed renamable $vgpr0_vgpr1, 1024, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i51)
-  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37
-  ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_MOV_B64 -1
+  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37
+  ; GFX90A-NEXT:   renamable $sgpr70_sgpr71 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr61, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr18_sgpr19, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr70_sgpr71 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr10_vgpr11 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr6_vgpr7 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr0_vgpr1 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr62_vgpr63 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr20 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr18 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF implicit-def $vgpr14
-  ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
-  ; GFX90A-NEXT:   $sgpr18_sgpr19 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr22 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr12 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18
+  ; GFX90A-NEXT:   renamable $sgpr18 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   $sgpr44_sgpr45 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.43, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.47.Flow25:
   ; GFX90A-NEXT:   successors: %bb.42(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def $scc
-  ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_XOR_B64 $exec, -1, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr70_sgpr71, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr58_sgpr59, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr44_sgpr45, implicit-def $scc
+  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_XOR_B64 $exec, -1, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr70_sgpr71, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_OR_B64 killed renamable $sgpr46_sgpr47, killed renamable $sgpr48_sgpr49, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr50_sgpr51, implicit-def dead $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.42
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.48.bb63:
   ; GFX90A-NEXT:   successors: %bb.50(0x40000000), %bb.49(0x40000000)
-  ; GFX90A-NEXT:   liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr50_sgpr51, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49
+  ; GFX90A-NEXT:   liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57:0x000000000000000F, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59, $sgpr48_sgpr49
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_MOV_B64 0
   ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.50, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.49:
   ; GFX90A-NEXT:   successors: %bb.44(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr48_sgpr49 = S_MOV_B64 -1
   ; GFX90A-NEXT:   S_BRANCH %bb.44
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.50.bb68:
   ; GFX90A-NEXT:   successors: %bb.54(0x40000000), %bb.51(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr50_sgpr51, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57:0x000000000000000F, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 3, $vgpr30, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr1 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr50_sgpr51, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.54, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.51:
   ; GFX90A-NEXT:   successors: %bb.45(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_MOV_B64 -1
   ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37
   ; GFX90A-NEXT:   renamable $vgpr10_vgpr11 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr6_vgpr7 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr20 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr18 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF implicit-def $vgpr14
-  ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
+  ; GFX90A-NEXT:   renamable $vgpr22 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr12 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18
+  ; GFX90A-NEXT:   renamable $sgpr18 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
   ; GFX90A-NEXT:   S_BRANCH %bb.45
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.52.bb80:
   ; GFX90A-NEXT:   successors: %bb.59(0x40000000), %bb.53(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CMP_EQ_U32 killed renamable $sgpr17, 0, implicit-def $scc
   ; GFX90A-NEXT:   renamable $vgpr8 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr9, dead renamable $sgpr52_sgpr53 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr9, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   S_CBRANCH_SCC1 %bb.59, implicit killed $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.53:
   ; GFX90A-NEXT:   successors: %bb.61(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_MOV_B64 -1
-  ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_MOV_B64 -1
+  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = COPY renamable $sgpr36_sgpr37
   ; GFX90A-NEXT:   renamable $vgpr10_vgpr11 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr20 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr18 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF implicit-def $vgpr14
-  ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
+  ; GFX90A-NEXT:   renamable $vgpr22 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr12 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18
+  ; GFX90A-NEXT:   renamable $sgpr18 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
   ; GFX90A-NEXT:   S_BRANCH %bb.61
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.54.bb73:
   ; GFX90A-NEXT:   successors: %bb.52(0x40000000), %bb.55(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr8 = FLAT_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i76)
   ; GFX90A-NEXT:   renamable $vgpr6 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr50_sgpr51 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_MOV_B64 -1
+  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_MOV_B64 -1
   ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37
-  ; GFX90A-NEXT:   renamable $vgpr7, dead renamable $sgpr58_sgpr59 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr7, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr8, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $vgpr10_vgpr11 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr8_vgpr9 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr20 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr18 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF implicit-def $vgpr14
-  ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
-  ; GFX90A-NEXT:   $sgpr60_sgpr61 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr22 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr12 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18
+  ; GFX90A-NEXT:   renamable $sgpr18 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   $sgpr62_sgpr63 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.52, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.55.Flow29:
   ; GFX90A-NEXT:   successors: %bb.45(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr60_sgpr61, implicit-def $scc
+  ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr62_sgpr63, implicit-def $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.45
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.56.bb90:
   ; GFX90A-NEXT:   successors: %bb.60(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $vgpr53 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr64_sgpr65, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr12 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr16_vgpr17 = DS_READ_B64_gfx9 killed renamable $vgpr12, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3)
-  ; GFX90A-NEXT:   renamable $vgpr12 = COPY renamable $sgpr21, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr18_vgpr19 = DS_READ_B64_gfx9 killed renamable $vgpr12, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3)
-  ; GFX90A-NEXT:   renamable $vgpr12 = COPY renamable $sgpr22, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr14_vgpr15 = DS_READ_B64_gfx9 killed renamable $vgpr12, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3)
-  ; GFX90A-NEXT:   renamable $vgpr12 = COPY renamable $sgpr46, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr13 = V_ALIGNBIT_B32_opsel_e64 0, killed $sgpr47, 0, killed $vgpr12, 0, 1, 0, 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr52 = V_ALIGNBIT_B32_opsel_e64 0, $vgpr19, 0, $vgpr18, 0, 1, 0, 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr19 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr17 = V_ALIGNBIT_B32_opsel_e64 0, $vgpr17, 0, $vgpr16, 0, 1, 0, 0, implicit $exec
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   renamable $vgpr12 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr66_sgpr67, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr13 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr14 = COPY renamable $sgpr21, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr22_vgpr23 = DS_READ_B64_gfx9 killed renamable $vgpr14, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3)
+  ; GFX90A-NEXT:   renamable $vgpr20_vgpr21 = DS_READ_B64_gfx9 killed renamable $vgpr13, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3)
+  ; GFX90A-NEXT:   renamable $vgpr13 = COPY renamable $sgpr22, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr18_vgpr19 = DS_READ_B64_gfx9 killed renamable $vgpr13, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3)
+  ; GFX90A-NEXT:   renamable $sgpr18_sgpr19 = S_LSHR_B64 killed renamable $sgpr56_sgpr57, 1, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $vgpr16_vgpr17 = V_LSHRREV_B64_e64 1, $vgpr22_vgpr23, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr17 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr32_vgpr33 = V_LSHRREV_B64_e64 1, $vgpr20_vgpr21, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_XOR_B64 $exec, -1, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $vgpr12 = COPY renamable $vgpr16, implicit $exec
+  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $vgpr14 = COPY renamable $vgpr20, implicit $exec
   ; GFX90A-NEXT:   S_BRANCH %bb.60
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.57:
   ; GFX90A-NEXT:   successors: %bb.7(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr40_sgpr41, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $vgpr19 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr52 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr22 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr18 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr14 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr19 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr12 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr20 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr32 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr16 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr15 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr53 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr44_sgpr45 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr42_sgpr43 = S_MOV_B64 0
-  ; GFX90A-NEXT:   renamable $sgpr40_sgpr41 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr66_sgpr67 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = S_MOV_B64 0
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_MOV_B64 0
@@ -812,79 +818,80 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.58.bb105:
   ; GFX90A-NEXT:   successors: %bb.3(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr4_vgpr5:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr4_vgpr5:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr0 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr22_vgpr23 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3)
+  ; GFX90A-NEXT:   renamable $vgpr26_vgpr27 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3)
   ; GFX90A-NEXT:   renamable $vgpr0 = COPY renamable $sgpr23, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr20_vgpr21 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.419, addrspace 3)
+  ; GFX90A-NEXT:   renamable $vgpr24_vgpr25 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.419, addrspace 3)
   ; GFX90A-NEXT:   renamable $vgpr0 = COPY renamable $sgpr21, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2_vgpr3 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3)
   ; GFX90A-NEXT:   renamable $vgpr0 = COPY killed renamable $sgpr33, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr12_vgpr13 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.420, addrspace 3)
+  ; GFX90A-NEXT:   renamable $vgpr14_vgpr15 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.420, addrspace 3)
   ; GFX90A-NEXT:   renamable $vgpr0 = COPY renamable $sgpr22, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr24_vgpr25 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3)
+  ; GFX90A-NEXT:   renamable $vgpr28_vgpr29 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3)
   ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_MOV_B64 -1
   ; GFX90A-NEXT:   S_BRANCH %bb.3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.59.bb85:
   ; GFX90A-NEXT:   successors: %bb.56(0x40000000), %bb.60(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr10 = V_OR_B32_e32 1, $vgpr8, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr11 = COPY renamable $vgpr9, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr12 = FLAT_LOAD_UBYTE renamable $vgpr10_vgpr11, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i86)
   ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_MOV_B64 -1
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr12, implicit $exec
-  ; GFX90A-NEXT:   renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37
-  ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $sgpr64_sgpr65 = COPY renamable $sgpr36_sgpr37
   ; GFX90A-NEXT:   renamable $vgpr17 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr32 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr20 = IMPLICIT_DEF
   ; GFX90A-NEXT:   renamable $vgpr16 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr52 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr18 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr53 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   renamable $vgpr15 = IMPLICIT_DEF implicit-def $vgpr14
-  ; GFX90A-NEXT:   renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
+  ; GFX90A-NEXT:   renamable $vgpr22 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr12 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18
+  ; GFX90A-NEXT:   renamable $sgpr18 = IMPLICIT_DEF
+  ; GFX90A-NEXT:   renamable $vgpr14 = IMPLICIT_DEF
   ; GFX90A-NEXT:   $sgpr54_sgpr55 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   S_CBRANCH_EXECNZ %bb.56, implicit $exec
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.60.Flow31:
   ; GFX90A-NEXT:   successors: %bb.61(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr54_sgpr55, implicit-def $scc
-  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_MOV_B64 0
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.61.Flow30:
   ; GFX90A-NEXT:   successors: %bb.55(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_XOR_B64 $exec, -1, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr58_sgpr59 = S_XOR_B64 $exec, -1, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc
   ; GFX90A-NEXT:   renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc
-  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_OR_B64 killed renamable $sgpr46_sgpr47, killed renamable $sgpr52_sgpr53, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc
+  ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_OR_B64 killed renamable $sgpr52_sgpr53, killed renamable $sgpr56_sgpr57, implicit-def dead $scc
   ; GFX90A-NEXT:   S_BRANCH %bb.55
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.62.bb140:
   ; GFX90A-NEXT:   successors: %bb.68(0x40000000), %bb.63(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_MOV_B64 -1
+  ; GFX90A-NEXT:   renamable $sgpr24_sgpr25 = S_MOV_B64 -1
   ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.68, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.63.Flow13:
   ; GFX90A-NEXT:   successors: %bb.64(0x40000000), %bb.66(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr36_sgpr37, implicit-def dead $scc
+  ; GFX90A-NEXT:   $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.66, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.64.bb159:
   ; GFX90A-NEXT:   successors: %bb.67(0x40000000), %bb.65(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_NE_U32_e64 0, killed $vgpr30, implicit $exec
   ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
@@ -893,104 +900,106 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.65.Flow10:
   ; GFX90A-NEXT:   successors: %bb.66(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $sgpr12_sgpr13 = S_ANDN2_SAVEEXEC_B64 $sgpr12_sgpr13, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.66.Flow14:
   ; GFX90A-NEXT:   successors: %bb.8(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $sgpr68_sgpr69 = COPY $exec
   ; GFX90A-NEXT:   S_BRANCH %bb.8
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.67.bb161:
   ; GFX90A-NEXT:   successors: %bb.65(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr21, killed $vgpr23, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr25, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr3 = V_OR_B32_e32 killed $vgpr13, killed $vgpr3, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr25, killed $vgpr27, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr29, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr3 = V_OR_B32_e32 killed $vgpr15, killed $vgpr3, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr3, killed $vgpr2, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr3 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr53, 0, $vgpr3, 0, 0, 6, implicit $exec
+  ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr12, 0, $vgpr3, 0, 0, 6, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr4 = V_OR_B32_e32 killed $vgpr52, killed $vgpr15, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr4 = V_OR_B32_e32 killed $vgpr16, killed $vgpr19, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr4, killed $vgpr2, implicit $exec
-  ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr19, 0, $vgpr3, 0, 0, 6, implicit $exec
+  ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr17, 0, $vgpr3, 0, 0, 6, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr17, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr32, implicit $exec
   ; GFX90A-NEXT:   DS_WRITE2_B32_gfx9 killed renamable $vgpr3, killed renamable $vgpr2, renamable $vgpr3, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3)
   ; GFX90A-NEXT:   S_BRANCH %bb.65
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.68.bb174:
   ; GFX90A-NEXT:   successors: %bb.72(0x40000000), %bb.69(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $vgpr26 = V_OR_B32_e32 1, $vgpr24, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr48 = V_OR_B32_e32 $vgpr26, $vgpr22, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr34 = V_OR_B32_e32 $vgpr48, $vgpr20, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr28 = V_CNDMASK_B32_e64 0, $vgpr34, 0, 0, $sgpr12_sgpr13, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr38 = V_OR_B32_e32 $vgpr28, $vgpr2, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr36 = V_OR_B32_e32 $vgpr38, $vgpr12, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr32 = V_OR_B32_e32 $vgpr36, $vgpr14, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr50 = V_CNDMASK_B32_e64 0, 0, 0, $vgpr32, killed $sgpr12_sgpr13, implicit $exec
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT: {{  $}}
+  ; GFX90A-NEXT:   renamable $agpr0 = COPY killed renamable $vgpr32, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr32 = V_OR_B32_e32 1, $vgpr28, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr52 = V_OR_B32_e32 $vgpr32, $vgpr26, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr38 = V_OR_B32_e32 $vgpr52, $vgpr24, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr34 = V_CNDMASK_B32_e64 0, $vgpr38, 0, 0, $sgpr12_sgpr13, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr50 = V_OR_B32_e32 $vgpr34, $vgpr2, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr48 = V_OR_B32_e32 $vgpr50, $vgpr14, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr36 = V_OR_B32_e32 $vgpr48, $vgpr18, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr54 = V_CNDMASK_B32_e64 0, 0, 0, $vgpr36, killed $sgpr12_sgpr13, implicit $exec
   ; GFX90A-NEXT:   renamable $sgpr12_sgpr13 = S_MOV_B64 -1
   ; GFX90A-NEXT:   renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr28_sgpr29, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.72, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.69.Flow:
   ; GFX90A-NEXT:   successors: %bb.70(0x40000000), %bb.71(0x40000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $agpr0_agpr1:0x0000000000000003, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr54_vgpr55:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc
   ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.71, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.70.bb186:
   ; GFX90A-NEXT:   successors: %bb.71(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $agpr0_agpr1:0x0000000000000003, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr54_vgpr55:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vgpr4_vgpr5 = V_LSHLREV_B64_e64 3, killed $vgpr4_vgpr5, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2 = COPY renamable $sgpr27, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr4, renamable $vcc = V_ADD_CO_U32_e64 killed $sgpr26, $vgpr4, 0, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr2, killed $vgpr5, killed $vcc, 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr27 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr49 = COPY renamable $vgpr27, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr35 = COPY renamable $vgpr27, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr39 = COPY renamable $vgpr27, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr37 = COPY renamable $vgpr27, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr29 = COPY renamable $vgpr27, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr51 = COPY renamable $vgpr27, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr33 = COPY renamable $vgpr27, implicit $exec
-  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr27, renamable $vgpr26_vgpr27, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
+  ; GFX90A-NEXT:   renamable $vgpr33 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr53 = COPY renamable $vgpr33, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr39 = COPY renamable $vgpr33, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr51 = COPY renamable $vgpr33, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr49 = COPY renamable $vgpr33, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr35 = COPY renamable $vgpr33, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr55 = COPY renamable $vgpr33, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr37 = COPY renamable $vgpr33, implicit $exec
+  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr33, renamable $vgpr32_vgpr33, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
   ; GFX90A-NEXT:   renamable $vgpr5 = COPY renamable $sgpr21, implicit $exec
+  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr5, killed renamable $vgpr52_vgpr53, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3)
+  ; GFX90A-NEXT:   renamable $vgpr13 = COPY killed renamable $sgpr22, implicit $exec
+  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr13, killed renamable $vgpr38_vgpr39, 0, 0, implicit $exec :: (store (s64) into %ir.8, addrspace 3)
+  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr33, killed renamable $vgpr50_vgpr51, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
   ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr5, killed renamable $vgpr48_vgpr49, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3)
-  ; GFX90A-NEXT:   renamable $vgpr12 = COPY killed renamable $sgpr22, implicit $exec
-  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr12, killed renamable $vgpr34_vgpr35, 0, 0, implicit $exec :: (store (s64) into %ir.8, addrspace 3)
-  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr27, killed renamable $vgpr38_vgpr39, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
-  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr5, killed renamable $vgpr36_vgpr37, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3)
-  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr27, killed renamable $vgpr28_vgpr29, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
-  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr5, killed renamable $vgpr50_vgpr51, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3)
-  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr27, killed renamable $vgpr32_vgpr33, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
+  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 renamable $vgpr33, killed renamable $vgpr34_vgpr35, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
+  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr5, killed renamable $vgpr54_vgpr55, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3)
+  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr33, killed renamable $vgpr36_vgpr37, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5)
   ; GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5)
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.71.Flow9:
   ; GFX90A-NEXT:   successors: %bb.63(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $agpr0_agpr1:0x0000000000000003, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $sgpr24_sgpr25 = S_MOV_B64 0
+  ; GFX90A-NEXT:   renamable $vgpr32 = COPY killed renamable $agpr0, implicit $exec
   ; GFX90A-NEXT:   S_BRANCH %bb.63
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.72.bb196:
   ; GFX90A-NEXT:   successors: %bb.69(0x80000000)
-  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
+  ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $agpr0_agpr1:0x0000000000000003, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr54_vgpr55:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
-  ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 $vgpr50, killed $vgpr18, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr54 = V_OR_B32_e32 killed $vgpr2, killed $vgpr16, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr55 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
-  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr55, renamable $vgpr54_vgpr55, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
+  ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 $vgpr54, killed $vgpr22, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr20 = V_OR_B32_e32 killed $vgpr2, killed $vgpr20, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr21 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec
+  ; GFX90A-NEXT:   DS_WRITE_B64_gfx9 killed renamable $vgpr21, renamable $vgpr20_vgpr21, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
   ; GFX90A-NEXT:   renamable $sgpr12_sgpr13 = S_MOV_B64 0
   ; GFX90A-NEXT:   S_BRANCH %bb.69
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 306fe33bfb7ac..8e12e7e03947b 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -2523,7 +2523,7 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 {
 ; CI-NEXT:    s_add_u32 s36, s36, s3
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; CI-NEXT:    buffer_load_dwordx2 v[3:4], off, s[0:3], 0
 ; CI-NEXT:    s_addc_u32 s37, s37, 0
 ; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
@@ -2533,9 +2533,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 {
 ; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; CI-NEXT:    s_mov_b32 s32, 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_alignbit_b32 v1, v3, v2, 16
-; CI-NEXT:    v_mov_b32_e32 v0, v2
-; CI-NEXT:    v_mov_b32_e32 v2, v3
+; CI-NEXT:    v_lshr_b64 v[1:2], v[3:4], 16
+; CI-NEXT:    v_mov_b32_e32 v0, v3
+; CI-NEXT:    v_mov_b32_e32 v2, v4
 ; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; CI-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
index a92b99aa1e2c1..1c5f8c84e447d 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
@@ -173,11 +173,12 @@ ret:
 ; GCN-LABEL: {{^}}sink_ubfe_i64_span_midpoint:
 
 ; GCN: s_cbranch_scc{{[0-1]}} .LBB3_2
-; GCN: v_alignbit_b32 v[[LO:[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, 30
-; GCN: v_and_b32_e32 v{{[0-9]+}}, 0x7f, v[[LO]]
+; GCN: s_lshr_b64 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]], s[[[LO2:[0-9]+]]:[[HI2:[0-9]+]]], 30
+; GCN: s_and_b32 s{{[0-9]+}},  s[[LO]], 0x7f
 
 ; GCN: .LBB3_3:
-; GCN: v_and_b32_e32 v{{[0-9]+}}, 0xff, v[[LO]]
+; GCN: s_lshr_b64 s[[[LO3:[0-9]+]]:[[HI3:[0-9]+]]], s[[[LO4:[0-9]+]]:[[HI4:[0-9]+]]], 30
+; GCN: s_and_b32 s{{[0-9]+}},  s[[LO3]], 0xff
 
 ; GCN: buffer_store_dwordx2
 define amdgpu_kernel void @sink_ubfe_i64_span_midpoint(ptr addrspace(1) %out, i64 %arg1, i1 %arg) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index 6a898fa799f3e..57cea74b0e9ea 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -1153,12 +1153,12 @@ define amdgpu_ps i32 @s_copysign_v2bf16(<2 x bfloat> inreg %arg_mag, <2 x bfloat
 define amdgpu_ps <3 x i16> @s_copysign_v3bf16(<3 x bfloat> inreg %arg_mag, <3 x bfloat> inreg %arg_sign) {
 ; GCN-LABEL: s_copysign_v3bf16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    v_mul_f32_e64 v0, 1.0, s5
+; GCN-NEXT:    v_mul_f32_e64 v0, 1.0, s3
 ; GCN-NEXT:    v_mul_f32_e64 v1, 1.0, s4
-; GCN-NEXT:    v_mul_f32_e64 v2, 1.0, s3
-; GCN-NEXT:    v_mul_f32_e64 v3, 1.0, s2
+; GCN-NEXT:    v_mul_f32_e64 v2, 1.0, s5
+; GCN-NEXT:    v_mul_f32_e64 v3, 1.0, s0
 ; GCN-NEXT:    v_mul_f32_e64 v4, 1.0, s1
-; GCN-NEXT:    v_mul_f32_e64 v5, 1.0, s0
+; GCN-NEXT:    v_mul_f32_e64 v5, 1.0, s2
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
@@ -1166,45 +1166,45 @@ define amdgpu_ps <3 x i16> @s_copysign_v3bf16(<3 x bfloat> inreg %arg_mag, <3 x
 ; GCN-NEXT:    v_bfe_u32 v4, v4, 16, 15
 ; GCN-NEXT:    v_bfe_u32 v3, v3, 16, 15
 ; GCN-NEXT:    v_and_b32_e32 v2, 0x8000, v2
-; GCN-NEXT:    v_and_b32_e32 v1, 0x8000, v1
+; GCN-NEXT:    v_and_b32_e32 v6, 0x8000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0x8000, v0
-; GCN-NEXT:    v_or_b32_e32 v2, v5, v2
-; GCN-NEXT:    v_or_b32_e32 v1, v4, v1
-; GCN-NEXT:    v_or_b32_e32 v0, v3, v0
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT:    v_or_b32_e32 v2, v2, v1
-; GCN-NEXT:    v_alignbit_b32 v1, v0, v1, 16
-; GCN-NEXT:    v_readfirstlane_b32 s1, v1
-; GCN-NEXT:    v_readfirstlane_b32 s0, v2
-; GCN-NEXT:    v_readfirstlane_b32 s2, v0
+; GCN-NEXT:    v_or_b32_e32 v1, v5, v2
+; GCN-NEXT:    v_or_b32_e32 v2, v4, v6
+; GCN-NEXT:    v_or_b32_e32 v3, v3, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; GCN-NEXT:    v_or_b32_e32 v4, v3, v0
+; GCN-NEXT:    v_lshr_b64 v[2:3], v[0:1], 16
+; GCN-NEXT:    v_readfirstlane_b32 s0, v4
+; GCN-NEXT:    v_readfirstlane_b32 s1, v2
+; GCN-NEXT:    v_readfirstlane_b32 s2, v1
 ; GCN-NEXT:    ; return to shader part epilog
 ;
 ; GFX7-LABEL: s_copysign_v3bf16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    v_mul_f32_e64 v1, 1.0, s4
-; GFX7-NEXT:    v_mul_f32_e64 v0, 1.0, s5
-; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_mul_f32_e64 v2, 1.0, s3
-; GFX7-NEXT:    v_mul_f32_e64 v4, 1.0, s1
+; GFX7-NEXT:    v_mul_f32_e64 v0, 1.0, s3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX7-NEXT:    v_mul_f32_e64 v0, 1.0, s4
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:    v_mul_f32_e64 v3, 1.0, s2
-; GFX7-NEXT:    v_mul_f32_e64 v5, 1.0, s0
-; GFX7-NEXT:    v_and_b32_e32 v1, 0x8000, v1
+; GFX7-NEXT:    v_mul_f32_e64 v1, 1.0, s5
+; GFX7-NEXT:    v_mul_f32_e64 v4, 1.0, s1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_mul_f32_e64 v3, 1.0, s0
+; GFX7-NEXT:    v_mul_f32_e64 v5, 1.0, s2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0x8000, v0
 ; GFX7-NEXT:    v_bfe_u32 v4, v4, 16, 15
-; GFX7-NEXT:    v_and_b32_e32 v2, 0x8000, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, 0x8000, v1
 ; GFX7-NEXT:    v_bfe_u32 v5, v5, 16, 15
-; GFX7-NEXT:    v_or_b32_e32 v1, v4, v1
-; GFX7-NEXT:    v_and_b32_e32 v0, 0x8000, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX7-NEXT:    v_and_b32_e32 v2, 0x8000, v2
 ; GFX7-NEXT:    v_bfe_u32 v3, v3, 16, 15
-; GFX7-NEXT:    v_or_b32_e32 v2, v5, v2
-; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT:    v_or_b32_e32 v0, v3, v0
-; GFX7-NEXT:    v_or_b32_e32 v2, v2, v1
-; GFX7-NEXT:    v_alignbit_b32 v1, v0, v1, 16
-; GFX7-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX7-NEXT:    v_readfirstlane_b32 s0, v2
-; GFX7-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX7-NEXT:    v_or_b32_e32 v1, v5, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX7-NEXT:    v_or_b32_e32 v4, v2, v0
+; GFX7-NEXT:    v_lshr_b64 v[2:3], v[0:1], 16
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v4
+; GFX7-NEXT:    v_readfirstlane_b32 s1, v2
+; GFX7-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX7-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-LABEL: s_copysign_v3bf16:
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index 574c1042859aa..fc33ae277df70 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -1450,29 +1450,29 @@ define amdgpu_ps <3 x i16> @s_copysign_v3f16(<3 x half> inreg %arg_mag, <3 x hal
 ; SI:       ; %bb.0:
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, s4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, s1
-; SI-NEXT:    v_cvt_f16_f32_e32 v0, s5
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, s2
-; SI-NEXT:    v_cvt_f16_f32_e32 v4, s3
-; SI-NEXT:    v_cvt_f16_f32_e32 v5, s0
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, s5
+; SI-NEXT:    v_cvt_f16_f32_e32 v5, s2
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, s3
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, s0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    s_brev_b32 s0, -2
 ; SI-NEXT:    v_bfi_b32 v2, s0, v3, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_bfi_b32 v3, s0, v5, v4
 ; SI-NEXT:    v_bfi_b32 v0, s0, v1, v0
-; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; SI-NEXT:    v_or_b32_e32 v2, v3, v1
-; SI-NEXT:    v_alignbit_b32 v1, v0, v1, 16
-; SI-NEXT:    v_readfirstlane_b32 s1, v1
-; SI-NEXT:    v_readfirstlane_b32 s0, v2
-; SI-NEXT:    v_readfirstlane_b32 s2, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v4, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v2
+; SI-NEXT:    v_lshr_b64 v[2:3], v[0:1], 16
+; SI-NEXT:    v_or_b32_e32 v4, v4, v0
+; SI-NEXT:    v_readfirstlane_b32 s0, v4
+; SI-NEXT:    v_readfirstlane_b32 s1, v2
+; SI-NEXT:    v_readfirstlane_b32 s2, v1
 ; SI-NEXT:    ; return to shader part epilog
 ;
 ; VI-LABEL: s_copysign_v3f16:
diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index 38003f6075c35..0084d936ec03b 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -895,11 +895,11 @@ define <3 x i16> @v3i16_func_void() #0 {
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    s_mov_b32 s6, -1
-; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; CI-NEXT:    buffer_load_dwordx2 v[3:4], off, s[4:7], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_alignbit_b32 v1, v3, v2, 16
-; CI-NEXT:    v_mov_b32_e32 v0, v2
-; CI-NEXT:    v_mov_b32_e32 v2, v3
+; CI-NEXT:    v_lshr_b64 v[1:2], v[3:4], 16
+; CI-NEXT:    v_mov_b32_e32 v0, v3
+; CI-NEXT:    v_mov_b32_e32 v2, v4
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-LABEL: v3i16_func_void:
@@ -1008,7 +1008,7 @@ define <5 x i16> @v5i16_func_void() #0 {
 ; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
 ; CI-NEXT:    buffer_load_sshort v4, off, s[4:7], 0 offset:8
 ; CI-NEXT:    s_waitcnt vmcnt(1)
-; CI-NEXT:    v_alignbit_b32 v5, v1, v0, 16
+; CI-NEXT:    v_lshr_b64 v[5:6], v[0:1], 16
 ; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; CI-NEXT:    v_mov_b32_e32 v2, v1
 ; CI-NEXT:    v_mov_b32_e32 v1, v5
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index eb5c5ef15ed56..6b094247e113c 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -111,22 +111,20 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB2_8
 ; GFX11-NEXT:  ; %bb.5: ; %bb18.preheader
 ; GFX11-NEXT:    s_load_b128 s[28:31], s[16:17], 0x44
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_mul_hi_u32 s0, s29, s28
-; GFX11-NEXT:    s_mul_i32 s1, s29, s28
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_alignbit_b32 v0, s0, s1, 1
+; GFX11-NEXT:    s_mul_hi_u32 s1, s29, s28
+; GFX11-NEXT:    s_mul_i32 s0, s29, s28
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_lshr_b64 s[0:1], s[0:1], 1
 ; GFX11-NEXT:    s_mov_b32 s1, 0
-; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_or_b32 s0, s0, 1
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_lshr_b32 s0, s0, s30
-; GFX11-NEXT:    s_mul_i32 s0, s0, s22
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_mul_i32 s0, s0, s22
 ; GFX11-NEXT:    s_mul_i32 s0, s0, s20
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_or_b32 s0, s19, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_lshl_b64 s[20:21], s[0:1], 1
 ; GFX11-NEXT:    s_mov_b32 s0, s1
 ; GFX11-NEXT:    global_load_u16 v1, v0, s[20:21]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll
index 103110af108de..fa6d878ad7556 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll
@@ -186,7 +186,7 @@ define amdgpu_ps ptr addrspace(8) @variable_top_half(ptr inreg %p, i64 inreg %nu
   ; CHECK45-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
   ; CHECK45-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
   ; CHECK45-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 25
-  ; CHECK45-NEXT:   [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], killed [[S_MOV_B32_]], implicit-def dead $scc
+  ; CHECK45-NEXT:   [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 killed [[COPY5]], killed [[S_MOV_B32_]], implicit-def dead $scc
   ; CHECK45-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; CHECK45-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_LSHL_B32_]], %subreg.sub1
   ; CHECK45-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[REG_SEQUENCE]], killed [[REG_SEQUENCE2]], implicit-def dead $scc
@@ -197,29 +197,28 @@ define amdgpu_ps ptr addrspace(8) @variable_top_half(ptr inreg %p, i64 inreg %nu
   ; CHECK45-NEXT:   [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
   ; CHECK45-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[DEF]], %subreg.sub0, killed [[S_LSHL_B32_1]], %subreg.sub1
   ; CHECK45-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 7
-  ; CHECK45-NEXT:   [[S_LSHR_B64_:%[0-9]+]]:sreg_64 = S_LSHR_B64 [[REG_SEQUENCE1]], [[S_MOV_B32_3]], implicit-def dead $scc
-  ; CHECK45-NEXT:   [[S_OR_B64_1:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[S_LSHR_B64_]], killed [[REG_SEQUENCE3]], implicit-def dead $scc
+  ; CHECK45-NEXT:   [[S_LSHR_B64_:%[0-9]+]]:sreg_64 = S_LSHR_B64 [[REG_SEQUENCE1]], killed [[S_MOV_B32_3]], implicit-def dead $scc
+  ; CHECK45-NEXT:   [[S_OR_B64_1:%[0-9]+]]:sreg_64 = S_OR_B64 [[S_LSHR_B64_]], killed [[REG_SEQUENCE3]], implicit-def dead $scc
   ; CHECK45-NEXT:   [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 16384
   ; CHECK45-NEXT:   [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
   ; CHECK45-NEXT:   [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
   ; CHECK45-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[DEF2]], %subreg.sub0, killed [[S_MOV_B32_4]], %subreg.sub1
   ; CHECK45-NEXT:   [[S_OR_B64_2:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[S_OR_B64_1]], killed [[REG_SEQUENCE4]], implicit-def dead $scc
   ; CHECK45-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_2]].sub1
-  ; CHECK45-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; CHECK45-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
-  ; CHECK45-NEXT:   [[V_ALIGNBIT_B32_fake16_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_fake16_e64 0, killed [[COPY8]], 0, [[COPY5]], 0, [[COPY9]], 0, 0, implicit $exec
-  ; CHECK45-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[V_ALIGNBIT_B32_fake16_e64_]], implicit $exec
-  ; CHECK45-NEXT:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
-  ; CHECK45-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec
-  ; CHECK45-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+  ; CHECK45-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
+  ; CHECK45-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY8]], implicit $exec
+  ; CHECK45-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+  ; CHECK45-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY9]], implicit $exec
+  ; CHECK45-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; CHECK45-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY10]]
   ; CHECK45-NEXT:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY11]], implicit $exec
-  ; CHECK45-NEXT:   [[COPY12:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; CHECK45-NEXT:   [[COPY12:%[0-9]+]]:sreg_32 = COPY [[S_LSHR_B64_]].sub0
   ; CHECK45-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY12]]
   ; CHECK45-NEXT:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
-  ; CHECK45-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_3]]
-  ; CHECK45-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_2]]
-  ; CHECK45-NEXT:   $sgpr2 = COPY [[V_READFIRSTLANE_B32_]]
-  ; CHECK45-NEXT:   $sgpr3 = COPY [[V_READFIRSTLANE_B32_1]]
+  ; CHECK45-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]]
+  ; CHECK45-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
+  ; CHECK45-NEXT:   $sgpr2 = COPY [[V_READFIRSTLANE_B32_3]]
+  ; CHECK45-NEXT:   $sgpr3 = COPY [[V_READFIRSTLANE_B32_]]
   ; CHECK45-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1, $sgpr2, $sgpr3
   %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 4, i64 %numVals, i32 %flags)
   ret ptr addrspace(8) %rsrc
@@ -270,13 +269,13 @@ define amdgpu_ps ptr addrspace(8) @general_case(ptr inreg %p, i16 inreg %stride,
   ; CHECK45-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1
   ; CHECK45-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
   ; CHECK45-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 7
-  ; CHECK45-NEXT:   [[S_LSHR_B64_:%[0-9]+]]:sreg_64 = S_LSHR_B64 [[REG_SEQUENCE1]], [[S_MOV_B32_]], implicit-def dead $scc
+  ; CHECK45-NEXT:   [[S_LSHR_B64_:%[0-9]+]]:sreg_64 = S_LSHR_B64 [[REG_SEQUENCE1]], killed [[S_MOV_B32_]], implicit-def dead $scc
   ; CHECK45-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY3]]
   ; CHECK45-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 12
   ; CHECK45-NEXT:   [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 killed [[COPY6]], killed [[S_MOV_B32_1]], implicit-def dead $scc
   ; CHECK45-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; CHECK45-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, killed [[S_LSHL_B32_]], %subreg.sub1
-  ; CHECK45-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[S_LSHR_B64_]], killed [[REG_SEQUENCE2]], implicit-def dead $scc
+  ; CHECK45-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[S_LSHR_B64_]], killed [[REG_SEQUENCE2]], implicit-def dead $scc
   ; CHECK45-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 28
   ; CHECK45-NEXT:   [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY]], killed [[S_MOV_B32_3]], implicit-def dead $scc
   ; CHECK45-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, killed [[S_LSHL_B32_1]], %subreg.sub1
@@ -284,25 +283,24 @@ define amdgpu_ps ptr addrspace(8) @general_case(ptr inreg %p, i16 inreg %stride,
   ; CHECK45-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_1]].sub1
   ; CHECK45-NEXT:   [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
   ; CHECK45-NEXT:   [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 25
-  ; CHECK45-NEXT:   [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY8]], killed [[S_MOV_B32_4]], implicit-def dead $scc
+  ; CHECK45-NEXT:   [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 killed [[COPY8]], killed [[S_MOV_B32_4]], implicit-def dead $scc
   ; CHECK45-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, killed [[S_LSHL_B32_2]], %subreg.sub1
   ; CHECK45-NEXT:   [[S_OR_B64_2:%[0-9]+]]:sreg_64 = S_OR_B64 [[REG_SEQUENCE]], killed [[REG_SEQUENCE4]], implicit-def dead $scc
   ; CHECK45-NEXT:   [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_2]].sub1
-  ; CHECK45-NEXT:   [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
-  ; CHECK45-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; CHECK45-NEXT:   [[V_ALIGNBIT_B32_fake16_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_fake16_e64 0, killed [[COPY10]], 0, [[COPY8]], 0, [[COPY11]], 0, 0, implicit $exec
-  ; CHECK45-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[V_ALIGNBIT_B32_fake16_e64_]], implicit $exec
-  ; CHECK45-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
-  ; CHECK45-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY12]], implicit $exec
-  ; CHECK45-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
+  ; CHECK45-NEXT:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY9]]
+  ; CHECK45-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec
+  ; CHECK45-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
+  ; CHECK45-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY11]], implicit $exec
+  ; CHECK45-NEXT:   [[COPY12:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; CHECK45-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY12]]
   ; CHECK45-NEXT:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec
-  ; CHECK45-NEXT:   [[COPY14:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+  ; CHECK45-NEXT:   [[COPY14:%[0-9]+]]:sreg_32 = COPY [[S_LSHR_B64_]].sub0
   ; CHECK45-NEXT:   [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY14]]
   ; CHECK45-NEXT:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY15]], implicit $exec
-  ; CHECK45-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_3]]
-  ; CHECK45-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
-  ; CHECK45-NEXT:   $sgpr2 = COPY [[V_READFIRSTLANE_B32_]]
-  ; CHECK45-NEXT:   $sgpr3 = COPY [[V_READFIRSTLANE_B32_2]]
+  ; CHECK45-NEXT:   $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]]
+  ; CHECK45-NEXT:   $sgpr1 = COPY [[V_READFIRSTLANE_B32_]]
+  ; CHECK45-NEXT:   $sgpr2 = COPY [[V_READFIRSTLANE_B32_3]]
+  ; CHECK45-NEXT:   $sgpr3 = COPY [[V_READFIRSTLANE_B32_1]]
   ; CHECK45-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1, $sgpr2, $sgpr3
   %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 %stride, i64 %numVals, i32 %flags)
   ret ptr addrspace(8) %rsrc
diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
index bf8994e005fc5..3d9c2a29cb9c1 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
@@ -815,9 +815,10 @@ define amdgpu_kernel void @test_umul_i24(ptr addrspace(1) %out, i32 %arg) {
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_lshr_b32 s2, s2, 9
-; SI-NEXT:    v_mul_hi_u32 v0, s2, v0
-; SI-NEXT:    s_mul_i32 s2, s2, 0xff803fe1
-; SI-NEXT:    v_alignbit_b32 v0, v0, s2, 1
+; SI-NEXT:    s_mul_i32 s4, s2, 0xff803fe1
+; SI-NEXT:    v_mul_hi_u32 v1, s2, v0
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_lshr_b64 v[0:1], v[0:1], 1
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -832,7 +833,7 @@ define amdgpu_kernel void @test_umul_i24(ptr addrspace(1) %out, i32 %arg) {
 ; VI-NEXT:    s_lshr_b32 s0, s0, 9
 ; VI-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s0, v0, 0
 ; VI-NEXT:    s_mov_b64 s[0:1], 0
-; VI-NEXT:    v_alignbit_b32 v0, v1, v0, 1
+; VI-NEXT:    v_lshrrev_b64 v[0:1], 1, v[0:1]
 ; VI-NEXT:    s_nop 2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
@@ -844,11 +845,11 @@ define amdgpu_kernel void @test_umul_i24(ptr addrspace(1) %out, i32 %arg) {
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshr_b32 s2, s2, 9
-; GFX9-NEXT:    s_mul_hi_u32 s4, s2, 0xff803fe1
-; GFX9-NEXT:    s_mul_i32 s2, s2, 0xff803fe1
-; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, 1
+; GFX9-NEXT:    s_mul_hi_u32 s5, s2, 0xff803fe1
+; GFX9-NEXT:    s_mul_i32 s4, s2, 0xff803fe1
+; GFX9-NEXT:    s_lshr_b64 s[4:5], s[4:5], 1
 ; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index ddec6af0af69e..697bcc3b8fb47 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -1073,27 +1073,26 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 %
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    s_sext_i32_i16 s1, s9
-; GCN-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-NEXT:    v_alignbit_b32 v0, s1, v0, 24
-; GCN-NEXT:    v_cvt_f32_i32_e32 v1, v0
 ; GCN-NEXT:    s_mov_b32 s4, s0
-; GCN-NEXT:    s_sext_i32_i16 s0, s3
-; GCN-NEXT:    v_mov_b32_e32 v2, s2
-; GCN-NEXT:    v_alignbit_b32 v2, s0, v2, 24
-; GCN-NEXT:    v_cvt_f32_i32_e32 v3, v2
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v1
-; GCN-NEXT:    v_xor_b32_e32 v0, v2, v0
-; GCN-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
-; GCN-NEXT:    v_or_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_mul_f32_e32 v2, v3, v4
+; GCN-NEXT:    s_sext_i32_i16 s9, s9
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    s_lshr_b64 s[0:1], s[8:9], 24
+; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s0
+; GCN-NEXT:    s_sext_i32_i16 s3, s3
+; GCN-NEXT:    s_lshr_b64 s[2:3], s[2:3], 24
+; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s2
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GCN-NEXT:    s_xor_b32 s0, s2, s0
+; GCN-NEXT:    s_ashr_i32 s0, s0, 30
+; GCN-NEXT:    s_or_b32 s2, s0, 1
+; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
-; GCN-NEXT:    v_mad_f32 v3, -v2, v1, v3
+; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v1|
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
+; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN-NEXT:    s_cselect_b32 s0, s2, 0
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
@@ -1107,27 +1106,26 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 %
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s6, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_mov_b32 s5, s1
-; GCN-IR-NEXT:    s_sext_i32_i16 s1, s9
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s8
-; GCN-IR-NEXT:    v_alignbit_b32 v0, s1, v0, 24
-; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, v0
 ; GCN-IR-NEXT:    s_mov_b32 s4, s0
-; GCN-IR-NEXT:    s_sext_i32_i16 s0, s3
-; GCN-IR-NEXT:    v_mov_b32_e32 v2, s2
-; GCN-IR-NEXT:    v_alignbit_b32 v2, s0, v2, 24
-; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v3, v2
-; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v4, v1
-; GCN-IR-NEXT:    v_xor_b32_e32 v0, v2, v0
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
-; GCN-IR-NEXT:    v_or_b32_e32 v0, 1, v0
-; GCN-IR-NEXT:    v_mul_f32_e32 v2, v3, v4
+; GCN-IR-NEXT:    s_sext_i32_i16 s9, s9
+; GCN-IR-NEXT:    s_mov_b32 s5, s1
+; GCN-IR-NEXT:    s_lshr_b64 s[0:1], s[8:9], 24
+; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s0
+; GCN-IR-NEXT:    s_sext_i32_i16 s3, s3
+; GCN-IR-NEXT:    s_lshr_b64 s[2:3], s[2:3], 24
+; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, s2
+; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GCN-IR-NEXT:    s_xor_b32 s0, s2, s0
+; GCN-IR-NEXT:    s_ashr_i32 s0, s0, 30
+; GCN-IR-NEXT:    s_or_b32 s2, s0, 1
+; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
-; GCN-IR-NEXT:    v_mad_f32 v3, -v2, v1, v3
+; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v1|
-; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
+; GCN-IR-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GCN-IR-NEXT:    s_cselect_b32 s0, s2, 0
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    buffer_store_dword v0, off, s[4:7], 0
diff --git a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
index 59a884c829312..760a126afa995 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
@@ -281,9 +281,9 @@ define amdgpu_kernel void @v_uextract_bit_31_32_i64(ptr addrspace(1) %out, ptr a
 ; GCN-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64
 ; GCN-NEXT:    s_mov_b64 s[6:7], s[10:11]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_alignbit_b32 v2, v3, v2, 31
-; GCN-NEXT:    v_and_b32_e32 v2, 3, v2
+; GCN-NEXT:    v_lshr_b64 v[2:3], v[2:3], 31
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
+; GCN-NEXT:    v_and_b32_e32 v2, 3, v2
 ; GCN-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
 ; GCN-NEXT:    s_endpgm
   %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index 2d95875cad882..465024a699d43 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -1232,34 +1232,33 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
 define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 %y) {
 ; GCN-LABEL: s_test_srem24_48:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_sext_i32_i16 s9, s9
+; GCN-NEXT:    s_lshr_b64 s[4:5], s[8:9], 24
+; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s4
 ; GCN-NEXT:    s_sext_i32_i16 s3, s3
-; GCN-NEXT:    s_sext_i32_i16 s5, s5
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_alignbit_b32 v0, s5, v0, 24
-; GCN-NEXT:    v_cvt_f32_i32_e32 v1, v0
-; GCN-NEXT:    v_mov_b32_e32 v2, s2
-; GCN-NEXT:    v_alignbit_b32 v2, s3, v2, 24
-; GCN-NEXT:    v_cvt_f32_i32_e32 v3, v2
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v1
-; GCN-NEXT:    v_xor_b32_e32 v5, v2, v0
-; GCN-NEXT:    v_ashrrev_i32_e32 v5, 30, v5
-; GCN-NEXT:    v_or_b32_e32 v5, 1, v5
-; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
-; GCN-NEXT:    v_trunc_f32_e32 v4, v4
-; GCN-NEXT:    v_mad_f32 v3, -v4, v1, v3
-; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v1|
-; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
+; GCN-NEXT:    s_lshr_b64 s[2:3], s[2:3], 24
+; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s2
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GCN-NEXT:    s_xor_b32 s3, s2, s4
+; GCN-NEXT:    s_ashr_i32 s3, s3, 30
+; GCN-NEXT:    s_or_b32 s3, s3, 1
+; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
+; GCN-NEXT:    v_trunc_f32_e32 v2, v2
+; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
+; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GCN-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, |v0|
+; GCN-NEXT:    s_and_b64 s[8:9], s[8:9], exec
+; GCN-NEXT:    s_cselect_b32 s3, s3, 0
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, s3, v2
+; GCN-NEXT:    v_mul_lo_u32 v0, v0, s4
 ; GCN-NEXT:    s_mov_b32 s4, s0
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
-; GCN-NEXT:    v_mul_lo_u32 v0, v1, v0
 ; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
@@ -1268,34 +1267,33 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 %
 ;
 ; GCN-IR-LABEL: s_test_srem24_48:
 ; GCN-IR:       ; %bb.0:
+; GCN-IR-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-IR-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s6, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-IR-NEXT:    s_sext_i32_i16 s9, s9
+; GCN-IR-NEXT:    s_lshr_b64 s[4:5], s[8:9], 24
+; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s4
 ; GCN-IR-NEXT:    s_sext_i32_i16 s3, s3
-; GCN-IR-NEXT:    s_sext_i32_i16 s5, s5
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-IR-NEXT:    v_alignbit_b32 v0, s5, v0, 24
-; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, v0
-; GCN-IR-NEXT:    v_mov_b32_e32 v2, s2
-; GCN-IR-NEXT:    v_alignbit_b32 v2, s3, v2, 24
-; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v3, v2
-; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v4, v1
-; GCN-IR-NEXT:    v_xor_b32_e32 v5, v2, v0
-; GCN-IR-NEXT:    v_ashrrev_i32_e32 v5, 30, v5
-; GCN-IR-NEXT:    v_or_b32_e32 v5, 1, v5
-; GCN-IR-NEXT:    v_mul_f32_e32 v4, v3, v4
-; GCN-IR-NEXT:    v_trunc_f32_e32 v4, v4
-; GCN-IR-NEXT:    v_mad_f32 v3, -v4, v1, v3
-; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v1|
-; GCN-IR-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
+; GCN-IR-NEXT:    s_lshr_b64 s[2:3], s[2:3], 24
+; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, s2
+; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GCN-IR-NEXT:    s_xor_b32 s3, s2, s4
+; GCN-IR-NEXT:    s_ashr_i32 s3, s3, 30
+; GCN-IR-NEXT:    s_or_b32 s3, s3, 1
+; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
+; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
+; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
+; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, |v0|
+; GCN-IR-NEXT:    s_and_b64 s[8:9], s[8:9], exec
+; GCN-IR-NEXT:    s_cselect_b32 s3, s3, 0
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, s3, v2
+; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s4
 ; GCN-IR-NEXT:    s_mov_b32 s4, s0
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, v1, v0
 ; GCN-IR-NEXT:    s_mov_b32 s5, s1
-; GCN-IR-NEXT:    v_subrev_i32_e32 v0, vcc, v0, v2
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    buffer_store_dword v0, off, s[4:7], 0
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index dc11e81476a7e..1ed04f8782d5d 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -721,16 +721,14 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_and_b32 s2, s2, 0xff000000
-; GCN-NEXT:    s_and_b32 s4, s4, 0xff000000
-; GCN-NEXT:    s_and_b32 s5, s5, 0xffff
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_alignbit_b32 v0, s5, v0, 24
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GCN-NEXT:    s_and_b32 s3, s3, 0xffff
-; GCN-NEXT:    v_mov_b32_e32 v1, s2
-; GCN-NEXT:    v_alignbit_b32 v1, s3, v1, 24
-; GCN-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GCN-NEXT:    s_and_b32 s5, s5, 0xffff
+; GCN-NEXT:    s_and_b32 s4, s4, 0xff000000
+; GCN-NEXT:    s_lshr_b64 s[4:5], s[4:5], 24
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GCN-NEXT:    s_and_b32 s2, s2, 0xff000000
+; GCN-NEXT:    s_lshr_b64 s[2:3], s[2:3], 24
+; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s2
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    s_mov_b32 s5, s1
@@ -753,16 +751,14 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48
 ; GCN-IR-NEXT:    s_mov_b32 s6, -1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_and_b32 s2, s2, 0xff000000
-; GCN-IR-NEXT:    s_and_b32 s4, s4, 0xff000000
-; GCN-IR-NEXT:    s_and_b32 s5, s5, 0xffff
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-IR-NEXT:    v_alignbit_b32 v0, s5, v0, 24
-; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GCN-IR-NEXT:    s_and_b32 s3, s3, 0xffff
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s2
-; GCN-IR-NEXT:    v_alignbit_b32 v1, s3, v1, 24
-; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GCN-IR-NEXT:    s_and_b32 s5, s5, 0xffff
+; GCN-IR-NEXT:    s_and_b32 s4, s4, 0xff000000
+; GCN-IR-NEXT:    s_lshr_b64 s[4:5], s[4:5], 24
+; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GCN-IR-NEXT:    s_and_b32 s2, s2, 0xff000000
+; GCN-IR-NEXT:    s_lshr_b64 s[2:3], s[2:3], 24
+; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v1, s2
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-IR-NEXT:    s_mov_b32 s4, s0
 ; GCN-IR-NEXT:    s_mov_b32 s5, s1
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-alignbit.ll b/llvm/test/CodeGen/AMDGPU/uniform-alignbit.ll
new file mode 100644
index 0000000000000..fe8c90ee7b686
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/uniform-alignbit.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GCN %s
+
+define amdgpu_kernel void @uniform_build_vector(i64 %in, ptr addrspace(1) %out) {
+; GCN-LABEL: uniform_build_vector:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshr_b64 s[4:5], s[0:1], 1
+; GCN-NEXT:    s_mov_b32 s5, 0
+; GCN-NEXT:    s_mov_b32 s6, s5
+; GCN-NEXT:    s_mov_b32 s7, s5
+; GCN-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    ; sched_barrier mask(0x00000000)
+; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
+; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GCN-NEXT:    global_store_dword v1, v0, s[2:3]
+; GCN-NEXT:    s_endpgm
+entry: ; Test body: a shifted, truncated kernel argument is placed in a <4 x i32> that feeds a raw buffer load.
+  %shifted = lshr i64 %in, 1 ; 64-bit logical right shift of the kernel argument by 1; the checks above show s_lshr_b64 ..., 1
+  %trunc = trunc i64 %shifted to i32 ; keep only the low 32 bits of the shifted value
+  %insert = insertelement <4 x i32> zeroinitializer, i32 %trunc, i64 0 ; element 0 = %trunc, elements 1-3 remain zero
+  %load = tail call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %insert, i32 0, i32 0, i32 0) ; %insert is the <4 x i32> operand; the three i32 operands are all 0
+  tail call void @llvm.amdgcn.sched.barrier(i32 0) ; shows up as "sched_barrier mask(0x00000000)" in the checks above
+  %extract = extractelement <4 x i32> %load, i64 0 ; only element 0 of the loaded vector is consumed
+  %and = and i32 %extract, 1 ; isolate bit 0 of the loaded dword
+  %convert = sitofp i32 %and to float ; operand is 0 or 1 after the mask; the checks above show v_cvt_f32_ubyte0
+  store float %convert, ptr addrspace(1) %out ; write the converted value through the %out pointer
+  ret void
+}
+
+; Function Attrs: convergent nocallback nofree nounwind willreturn
+declare void @llvm.amdgcn.sched.barrier(i32 immarg) #0 ; NOTE(review): no "attributes #0 = {...}" group is defined in this file - confirm this is intended
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
+declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32 immarg) #1 ; NOTE(review): no "attributes #1 = {...}" group is defined in this file - confirm this is intended
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index dc25caadb99a9..b846ce7f12466 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -1033,30 +1033,30 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mul_hi_u32 v3, s2, v2
 ; GCN-NEXT:    v_mul_hi_u32 v2, s3, v2
-; GCN-NEXT:    v_mul_hi_u32 v1, s2, v0
 ; GCN-NEXT:    s_mul_i32 s5, s3, 0xaaaaaaab
+; GCN-NEXT:    v_mul_hi_u32 v1, s2, v0
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, s5, v3
 ; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    s_mul_i32 s0, s2, 0xaaaaaaaa
 ; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, s0, v3
-; GCN-NEXT:    v_mul_hi_u32 v0, s3, v0
+; GCN-NEXT:    v_mul_hi_u32 v3, s3, v0
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-NEXT:    s_mul_i32 s0, s3, 0xaaaaaaaa
 ; GCN-NEXT:    v_addc_u32_e64 v2, s[8:9], 0, 0, vcc
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, s0, v1
-; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v0, v2, vcc
-; GCN-NEXT:    v_alignbit_b32 v1, v0, v1, 4
-; GCN-NEXT:    v_lshrrev_b32_e32 v0, 4, v0
-; GCN-NEXT:    v_mul_hi_u32 v2, v1, 24
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v1
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v3, v2, vcc
+; GCN-NEXT:    v_lshrrev_b32_e32 v2, 4, v1
+; GCN-NEXT:    v_lshr_b64 v[0:1], v[0:1], 4
+; GCN-NEXT:    v_mul_lo_u32 v1, v2, 24
+; GCN-NEXT:    v_mul_hi_u32 v2, v0, 24
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, 24
-; GCN-NEXT:    v_mul_lo_u32 v1, v1, 24
-; GCN-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v0, v2
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s2, v1
-; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v3, v2, vcc
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; GCN-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
+; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
 ;



More information about the llvm-commits mailing list